From b333242b51b14810850eb13a7ae2e0b6a09a50c7 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 09:39:41 +0800 Subject: [PATCH 01/32] introduce schema_name Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 36 +++++++++++++++++-- datafusion/expr/src/expr_rewriter/mod.rs | 2 +- datafusion/expr/src/expr_rewriter/order_by.rs | 2 +- datafusion/expr/src/expr_schema.rs | 11 +++++- datafusion/expr/src/logical_plan/builder.rs | 12 ++++--- datafusion/expr/src/logical_plan/plan.rs | 2 +- datafusion/expr/src/utils.rs | 2 +- .../optimizer/src/common_subexpr_eliminate.rs | 2 +- datafusion/optimizer/src/decorrelate.rs | 6 ++-- .../optimizer/src/optimize_projections/mod.rs | 2 +- datafusion/optimizer/src/push_down_filter.rs | 4 +-- .../optimizer/src/scalar_subquery_to_join.rs | 4 +-- .../src/single_distinct_to_groupby.rs | 2 +- datafusion/sql/src/unparser/utils.rs | 2 +- datafusion/sql/src/utils.rs | 4 +-- 15 files changed, 67 insertions(+), 26 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 708843494814..bf194ec4f189 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -573,6 +573,12 @@ pub struct Cast { pub data_type: DataType, } +impl fmt::Display for Cast { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "CAST({} AS {:?})", self.expr, self.data_type) + } +} + impl Cast { /// Create a new Cast expression pub fn new(expr: Box, data_type: DataType) -> Self { @@ -1000,10 +1006,28 @@ impl PartialOrd for Expr { impl Expr { /// Returns the name of this expression as it should appear in a schema. This name /// will not include any CAST expressions. + /// TODO: Replace this with Display pub fn display_name(&self) -> Result { create_name(self) } + /// Returns the name for schema. + /// 1. Cast is not shown + /// 2. Alias doesn't contain expr + pub fn schema_name(&self) -> Result { + match self { + Expr::Column(c) => Ok(format!("{c}")), + Expr::Alias(Alias { name, .. }) => { + Ok(name.to_owned()) + } + Expr::Cast(Cast { expr, data_type: _ }) => { + expr.display_name() + } + _ => self.display_name() + } + + } + /// Returns a full and complete string representation of this expression. pub fn canonical_name(&self) -> String { format!("{self}") @@ -1134,7 +1158,9 @@ impl Expr { match self { // call Expr::display_name() on a Expr::Sort will throw an error Expr::Sort(Sort { expr, .. }) => expr.name_for_alias(), - expr => expr.display_name(), + // TODO: we should use shema's name instead of display_name + // expr => expr.display_name(), + expr => expr.schema_name(), } } @@ -1142,7 +1168,6 @@ impl Expr { /// alias if necessary. pub fn alias_if_changed(self, original_name: String) -> Result { let new_name = self.name_for_alias()?; - if new_name == original_name { return Ok(self); } @@ -2100,9 +2125,14 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { } w.write_str("END")?; } - Expr::Cast(Cast { expr, .. }) => { + // Expr::Cast(c) => { + Expr::Cast(Cast { expr, data_type: _ }) => { // CAST does not change the expression name + + // TODO: display column + // write!(w, "{c}")?; write_name(w, expr)?; + } Expr::TryCast(TryCast { expr, .. }) => { // CAST does not change the expression name diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index bf2bfe2c3932..bf63e1b4ac79 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -173,7 +173,7 @@ pub fn create_col_from_scalar_expr( name, )), _ => { - let scalar_column = scalar_expr.display_name()?; + let scalar_column = scalar_expr.schema_name()?; Ok(Column::new( Some::(subqry_alias.into()), scalar_column, diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 2efdcae1a790..aac7a4780daf 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -109,7 +109,7 @@ fn rewrite_in_terms_of_projection( // expr is an actual expr like min(t.c2), but we are looking // for a column with the same "MIN(C2)", so translate there - let name = normalized_expr.display_name()?; + let name = normalized_expr.schema_name()?; let search_col = Expr::Column(Column { relation: None, diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 6344b892adb7..a11a3d30db2d 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -474,11 +474,20 @@ impl ExprSchemable for Expr { .into(), )) } + Expr::Cast(Cast { expr, data_type: _ }) => { + let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; + Ok(( + None, + Field::new(expr.schema_name()?, data_type, nullable) + .with_metadata(self.metadata(input_schema)?) + .into(), + )) + } _ => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( None, - Field::new(self.display_name()?, data_type, nullable) + Field::new(self.schema_name()?, data_type, nullable) .with_metadata(self.metadata(input_schema)?) .into(), )) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 736310c7ac0f..66a2aefa55bf 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1298,7 +1298,7 @@ fn add_group_by_exprs_from_dependencies( // c1 + 1` produces an output field named `"c1 + 1"` let mut group_by_field_names = group_expr .iter() - .map(|e| e.display_name()) + .map(|e| e.schema_name()) .collect::>>()?; if let Some(target_indices) = @@ -1306,7 +1306,7 @@ fn add_group_by_exprs_from_dependencies( { for idx in target_indices { let expr = Expr::Column(Column::from(schema.qualified_field(idx))); - let expr_name = expr.display_name()?; + let expr_name = expr.schema_name()?; if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); group_expr.push(expr); @@ -1323,7 +1323,7 @@ pub(crate) fn validate_unique_names<'a>( let mut unique_names = HashMap::new(); expressions.into_iter().enumerate().try_for_each(|(position, expr)| { - let name = expr.display_name()?; + let name = expr.schema_name()?; match unique_names.get(&name) { None => { unique_names.insert(name, (position, expr)); @@ -1456,7 +1456,9 @@ pub fn project( input_schema, None, )?), - _ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?), + _ => { + projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?) + } } } validate_unique_names("Projections", projected_expr.iter())?; @@ -1557,7 +1559,7 @@ pub fn wrap_projection_for_join_if_necessary( if let Some(col) = key.try_as_col() { Ok(col.clone()) } else { - let name = key.display_name()?; + let name = key.schema_name()?; Ok(Column::from_name(name)) } }) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 6bea1ad948a1..aa7f986a6eae 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2727,7 +2727,7 @@ fn calc_func_dependencies_for_aggregate( if !contains_grouping_set(group_expr) { let group_by_expr_names = group_expr .iter() - .map(|item| item.display_name()) + .map(|item| item.schema_name()) .collect::>>()?; let aggregate_func_dependencies = aggregate_functional_dependencies( input.schema(), diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 683a8e170ed4..07cd3dedff10 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -798,7 +798,7 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; Ok(Expr::from(Column::from((qualifier, field)))) } - _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), + _ => Ok(Expr::Column(Column::from_name(expr.schema_name()?))), } } diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 70ca6f5304ad..8998c468cb40 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1108,7 +1108,7 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_, '_> { self.down_index += 1; } - let expr_name = expr.display_name()?; + let expr_name = expr.schema_name()?; let (_, expr_alias) = self.common_exprs.entry(expr_id).or_insert_with(|| { let expr_alias = self.alias_generator.next(CSE_PREFIX); diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 6dbf1641bd7c..380dddc73654 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -458,7 +458,7 @@ fn agg_exprs_evaluation_result_on_empty_batch( let simplifier = ExprSimplifier::new(info); let result_expr = simplifier.simplify(result_expr)?; if matches!(result_expr, Expr::Literal(ScalarValue::Int64(_))) { - expr_result_map_for_count_bug.insert(e.display_name()?, result_expr); + expr_result_map_for_count_bug.insert(e.schema_name()?, result_expr); } } Ok(()) @@ -496,7 +496,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( let expr_name = match expr { Expr::Alias(Alias { name, .. }) => name.to_string(), Expr::Column(Column { relation: _, name }) => name.to_string(), - _ => expr.display_name()?, + _ => expr.schema_name()?, }; expr_result_map_for_count_bug.insert(expr_name, result_expr); } @@ -553,7 +553,7 @@ fn filter_exprs_evaluation_result_on_empty_batch( else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null))), }); expr_result_map_for_count_bug - .insert(new_expr.display_name()?, new_expr); + .insert(new_expr.schema_name()?, new_expr); } None } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 31d59da13323..f863b3ba8cf0 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -135,7 +135,7 @@ fn optimize_projections( let group_by_expr_existing = aggregate .group_expr .iter() - .map(|group_by_expr| group_by_expr.display_name()) + .map(|group_by_expr| group_by_expr.schema_name()) .collect::>>()?; let new_group_bys = if let Some(simplest_groupby_indices) = diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index f9c9ec961c8e..7be997c844af 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -817,7 +817,7 @@ impl OptimizerRule for PushDownFilter { let group_expr_columns = agg .group_expr .iter() - .map(|e| Ok(Column::from_qualified_name(e.display_name()?))) + .map(|e| Ok(Column::from_qualified_name(e.schema_name()?))) .collect::>>()?; let predicates = split_conjunction_owned(filter.predicate.clone()); @@ -838,7 +838,7 @@ impl OptimizerRule for PushDownFilter { // So we need create a replace_map, add {`a+b` --> Expr(Column(a)+Column(b))} let mut replace_map = HashMap::new(); for expr in &agg.group_expr { - replace_map.insert(expr.display_name()?, expr.clone()); + replace_map.insert(expr.schema_name()?, expr.clone()); } let replaced_push_predicates = push_predicates .into_iter() diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index fbec675f6fc4..3e8cf77130f1 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -188,9 +188,9 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut proj_exprs = vec![]; for expr in projection.expr.iter() { - let old_expr_name = expr.display_name()?; + let old_expr_name = expr.schema_name()?; let new_expr = expr_to_rewrite_expr_map.get(expr).unwrap(); - let new_expr_name = new_expr.display_name()?; + let new_expr_name = new_expr.schema_name()?; if new_expr_name != old_expr_name { proj_exprs.push(new_expr.clone().alias(old_expr_name)) } else { diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 69c1b505727d..7c77772969dc 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -195,7 +195,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { } let arg = args.swap_remove(0); - if group_fields_set.insert(arg.display_name()?) { + if group_fields_set.insert(arg.schema_name()?) { inner_group_exprs .push(arg.alias(SINGLE_DISTINCT_ALIAS)); } diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index 71f64f1cf459..8d6af529cb0d 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -115,7 +115,7 @@ pub(crate) fn unproject_window_exprs(expr: &Expr, windows: &[&Window]) -> Result if let Some(unproj) = windows .iter() .flat_map(|w| w.window_expr.iter()) - .find(|window_expr| window_expr.display_name().unwrap() == c.name) + .find(|window_expr| window_expr.schema_name().unwrap() == c.name) { Ok(Transformed::yes(unproj.clone())) } else { diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 3b044646e6cb..eef8763024a1 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -329,7 +329,7 @@ pub(crate) fn transform_bottom_unnest( // Full context, we are trying to plan the execution as InnerProjection->Unnest->OuterProjection // inside unnest execution, each column inside the inner projection // will be transformed into new columns. Thus we need to keep track of these placeholding column names - let placeholder_name = unnest_expr.display_name()?; + let placeholder_name = unnest_expr.schema_name()?; unnest_placeholder_columns.push(placeholder_name.clone()); // Add alias for the argument expression, to avoid naming conflicts @@ -402,7 +402,7 @@ pub(crate) fn transform_bottom_unnest( } else { // We need to evaluate the expr in the inner projection, // outer projection just select its name - let column_name = transformed_expr.display_name()?; + let column_name = transformed_expr.schema_name()?; inner_projection_exprs.push(transformed_expr); Ok(vec![Expr::Column(Column::from_name(column_name))]) } From 16fd0aa98bff5cd088ad4c201761eaed802370ea Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 09:57:49 +0800 Subject: [PATCH 02/32] fix Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 25 +++++++------------ datafusion/expr/src/expr_rewriter/mod.rs | 8 +++--- datafusion/expr/src/logical_plan/builder.rs | 4 +-- .../optimizer/src/optimize_projections/mod.rs | 4 +-- .../tests/cases/roundtrip_logical_plan.rs | 4 ++- datafusion/proto/tests/cases/serialize.rs | 2 +- .../substrait/src/logical_plan/consumer.rs | 2 +- 7 files changed, 21 insertions(+), 28 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index bf194ec4f189..85daa5e36d10 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1006,26 +1006,22 @@ impl PartialOrd for Expr { impl Expr { /// Returns the name of this expression as it should appear in a schema. This name /// will not include any CAST expressions. - /// TODO: Replace this with Display + /// TODO: Deprecate it and use Display pub fn display_name(&self) -> Result { create_name(self) } - /// Returns the name for schema. - /// 1. Cast is not shown - /// 2. Alias doesn't contain expr + /// Returns the name for schema / field that is different from display pub fn schema_name(&self) -> Result { match self { Expr::Column(c) => Ok(format!("{c}")), - Expr::Alias(Alias { name, .. }) => { - Ok(name.to_owned()) - } - Expr::Cast(Cast { expr, data_type: _ }) => { - expr.display_name() - } - _ => self.display_name() + // expr is not shown since it is aliased + Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), + // cast expr is not shown to be consistant with Postgres and Spark + Expr::Cast(Cast { expr, data_type: _ }) => expr.display_name(), + // Most of the expr has no difference + _ => self.display_name(), } - } /// Returns a full and complete string representation of this expression. @@ -1158,8 +1154,6 @@ impl Expr { match self { // call Expr::display_name() on a Expr::Sort will throw an error Expr::Sort(Sort { expr, .. }) => expr.name_for_alias(), - // TODO: we should use shema's name instead of display_name - // expr => expr.display_name(), expr => expr.schema_name(), } } @@ -2132,7 +2126,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { // TODO: display column // write!(w, "{c}")?; write_name(w, expr)?; - } Expr::TryCast(TryCast { expr, .. }) => { // CAST does not change the expression name @@ -2353,7 +2346,7 @@ mod test { assert_eq!(expected_canonical, format!("{expr}")); // note that CAST intentionally has a name that is different from its `Display` // representation. CAST does not change the name of expressions. - assert_eq!("Float32(1.23)", expr.display_name()?); + assert_eq!("Float32(1.23)", expr.schema_name()?); Ok(()) } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index bf63e1b4ac79..fe2940314647 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -475,14 +475,14 @@ mod test { let expr = rewrite_preserving_name(expr_from.clone(), &mut rewriter).unwrap(); let original_name = match &expr_from { - Expr::Sort(Sort { expr, .. }) => expr.display_name(), - expr => expr.display_name(), + Expr::Sort(Sort { expr, .. }) => expr.schema_name(), + expr => expr.schema_name(), } .unwrap(); let new_name = match &expr { - Expr::Sort(Sort { expr, .. }) => expr.display_name(), - expr => expr.display_name(), + Expr::Sort(Sort { expr, .. }) => expr.schema_name(), + expr => expr.schema_name(), } .unwrap(); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 66a2aefa55bf..7b96493d53fe 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1456,9 +1456,7 @@ pub fn project( input_schema, None, )?), - _ => { - projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?) - } + _ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?), } } validate_unique_names("Projections", projected_expr.iter())?; diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index f863b3ba8cf0..bb5c4e2c5c22 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -1928,8 +1928,8 @@ mod tests { WindowFunctionDefinition::AggregateUDF(max_udaf()), vec![col("test.b")], )); - let col1 = col(max1.display_name()?); - let col2 = col(max2.display_name()?); + let col1 = col(max1.schema_name()?); + let col2 = col(max2.schema_name()?); let plan = LogicalPlanBuilder::from(table_scan) .window(vec![max1])? diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index d150c474e88f..cd06759cf231 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -592,7 +592,9 @@ async fn roundtrip_logical_plan_copy_to_parquet() -> Result<()> { // Set specific Parquet format options let mut key_value_metadata = HashMap::new(); key_value_metadata.insert("test".to_string(), Some("test".to_string())); - parquet_format.key_value_metadata = key_value_metadata.clone(); + parquet_format + .key_value_metadata + .clone_from(&key_value_metadata); parquet_format.global.allow_single_file_parallelism = false; parquet_format.global.created_by = "test".to_string(); diff --git a/datafusion/proto/tests/cases/serialize.rs b/datafusion/proto/tests/cases/serialize.rs index cc683e778ebc..7fda9a2550df 100644 --- a/datafusion/proto/tests/cases/serialize.rs +++ b/datafusion/proto/tests/cases/serialize.rs @@ -276,7 +276,7 @@ fn test_expression_serialization_roundtrip() { /// Extracts the first part of a function name /// 'foo(bar)' -> 'foo' fn extract_function_name(expr: &Expr) -> String { - let name = expr.display_name().unwrap(); + let name = expr.schema_name().unwrap(); name.split('(').next().unwrap().to_string() } } diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 89f2efec66aa..10154edfdfd7 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -417,7 +417,7 @@ pub async fn from_substrait_rel( } // Ensure the expression has a unique display name, so that project's // validate_unique_names doesn't fail - let name = x.display_name()?; + let name = x.schema_name()?; let mut new_name = name.clone(); let mut i = 0; while names.contains(&new_name) { From 6923d84e2d139e2adab151fd4d1ee6a884c26433 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 10:09:57 +0800 Subject: [PATCH 03/32] cleanup Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 85daa5e36d10..66dbc981f2cd 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -573,12 +573,6 @@ pub struct Cast { pub data_type: DataType, } -impl fmt::Display for Cast { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "CAST({} AS {:?})", self.expr, self.data_type) - } -} - impl Cast { /// Create a new Cast expression pub fn new(expr: Box, data_type: DataType) -> Self { @@ -2047,8 +2041,8 @@ pub(crate) fn create_name(e: &Expr) -> Result { fn write_name(w: &mut W, e: &Expr) -> Result<()> { match e { Expr::Alias(Alias { name, .. }) => write!(w, "{}", name)?, - Expr::Column(c) => write!(w, "{}", c.flat_name())?, - Expr::OuterReferenceColumn(_, c) => write!(w, "outer_ref({})", c.flat_name())?, + Expr::Column(c) => write!(w, "{c}")?, + Expr::OuterReferenceColumn(_, c) => write!(w, "outer_ref({c})")?, Expr::ScalarVariable(_, variable_names) => { write!(w, "{}", variable_names.join("."))? } @@ -2119,12 +2113,8 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { } w.write_str("END")?; } - // Expr::Cast(c) => { Expr::Cast(Cast { expr, data_type: _ }) => { // CAST does not change the expression name - - // TODO: display column - // write!(w, "{c}")?; write_name(w, expr)?; } Expr::TryCast(TryCast { expr, .. }) => { From 9ff186930a7b14db000927bffbcb2820cf304d43 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 10:13:49 +0800 Subject: [PATCH 04/32] cleanup Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 4 ++-- datafusion/expr/src/expr_schema.rs | 9 --------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 66dbc981f2cd..56bf9374810e 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1012,7 +1012,7 @@ impl Expr { // expr is not shown since it is aliased Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), // cast expr is not shown to be consistant with Postgres and Spark - Expr::Cast(Cast { expr, data_type: _ }) => expr.display_name(), + Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), // Most of the expr has no difference _ => self.display_name(), } @@ -2113,7 +2113,7 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { } w.write_str("END")?; } - Expr::Cast(Cast { expr, data_type: _ }) => { + Expr::Cast(Cast { expr, .. }) => { // CAST does not change the expression name write_name(w, expr)?; } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index a11a3d30db2d..4b01796770e7 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -474,15 +474,6 @@ impl ExprSchemable for Expr { .into(), )) } - Expr::Cast(Cast { expr, data_type: _ }) => { - let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok(( - None, - Field::new(expr.schema_name()?, data_type, nullable) - .with_metadata(self.metadata(input_schema)?) - .into(), - )) - } _ => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( From 881bccdaa78b454babd447067ec908997180adaa Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 10:16:34 +0800 Subject: [PATCH 05/32] fix doc Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 56bf9374810e..e969d7e8b29d 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1000,7 +1000,7 @@ impl PartialOrd for Expr { impl Expr { /// Returns the name of this expression as it should appear in a schema. This name /// will not include any CAST expressions. - /// TODO: Deprecate it and use Display + // TODO: Deprecate it and use Display pub fn display_name(&self) -> Result { create_name(self) } From d64ca1ebb597f4c2a9db28f27bf665ca6ae150ed Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 10:37:15 +0800 Subject: [PATCH 06/32] reuse for simple case Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index e969d7e8b29d..ab8c9f493fac 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -2040,13 +2040,15 @@ pub(crate) fn create_name(e: &Expr) -> Result { fn write_name(w: &mut W, e: &Expr) -> Result<()> { match e { + // Reuse Display trait + Expr::Column(_) + | Expr::Literal(_) + | Expr::Placeholder(_) + | Expr::OuterReferenceColumn(..) + | Expr::ScalarVariable(..) | + Expr::Wildcard { .. }=> write!(w, "{e}")?, + Expr::Alias(Alias { name, .. }) => write!(w, "{}", name)?, - Expr::Column(c) => write!(w, "{c}")?, - Expr::OuterReferenceColumn(_, c) => write!(w, "outer_ref({c})")?, - Expr::ScalarVariable(_, variable_names) => { - write!(w, "{}", variable_names.join("."))? - } - Expr::Literal(value) => write!(w, "{value:?}")?, Expr::BinaryExpr(binary_expr) => { write_name(w, binary_expr.left.as_ref())?; write!(w, " {} ", binary_expr.op)?; @@ -2277,15 +2279,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { Expr::Sort { .. } => { return internal_err!("Create name does not support sort expression") } - Expr::Wildcard { qualifier } => match qualifier { - Some(qualifier) => { - return internal_err!( - "Create name does not support qualified wildcard, got {qualifier}" - ) - } - None => write!(w, "*")?, - }, - Expr::Placeholder(Placeholder { id, .. }) => write!(w, "{}", id)?, }; Ok(()) } From 92d6fa0f170c89cd7c38894547e337b383d11fa9 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 12:10:44 +0800 Subject: [PATCH 07/32] unnest + udf Signed-off-by: jayzhan211 --- datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/expr.rs | 8 +++++++- datafusion/expr/src/udf.rs | 16 +++++++++++++++- datafusion/functions-nested/src/extract.rs | 4 ++-- datafusion/functions/src/core/getfield.rs | 4 ++-- 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 03e20b886e2c..44b2e6b2d6b7 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -213,7 +213,7 @@ fn create_physical_name(e: &Expr, is_first_expr: bool) -> Result { let expr = create_physical_name(expr, false)?; Ok(format!("{expr} IS NOT UNKNOWN")) } - Expr::ScalarFunction(fun) => fun.func.display_name(&fun.args), + Expr::ScalarFunction(fun) => fun.func.schema_name(&fun.args), Expr::WindowFunction(WindowFunction { fun, args, diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ab8c9f493fac..044bcd1c703b 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1013,6 +1013,12 @@ impl Expr { Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), + Expr::ScalarFunction(ScalarFunction { func, args }) => { + func.schema_name(args) + } + Expr::Unnest(Unnest { expr }) => { + Ok(format!("unnest({})", expr.schema_name()?)) + } // Most of the expr has no difference _ => self.display_name(), } @@ -2048,7 +2054,7 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::ScalarVariable(..) | Expr::Wildcard { .. }=> write!(w, "{e}")?, - Expr::Alias(Alias { name, .. }) => write!(w, "{}", name)?, + Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, Expr::BinaryExpr(binary_expr) => { write_name(w, binary_expr.left.as_ref())?; write!(w, " {} ", binary_expr.op)?; diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 5ba6e3007ead..8135a3a3bfb8 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -154,6 +154,13 @@ impl ScalarUDF { self.inner.display_name(args) } + /// Returns this function's schema_name. + /// + /// See [`ScalarUDFImpl::schema_name`] for more details + pub fn schema_name(&self, args: &[Expr]) -> Result { + self.inner.schema_name(args) + } + /// Returns the aliases for this function. /// /// See [`ScalarUDF::with_aliases`] for more details @@ -345,12 +352,19 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { fn name(&self) -> &str; /// Returns the user-defined display name of the UDF given the arguments - /// fn display_name(&self, args: &[Expr]) -> Result { let names: Vec = args.iter().map(create_name).collect::>()?; + // TODO: join with ", " to standardize the formatting of Vec Ok(format!("{}({})", self.name(), names.join(","))) } + /// Returns the user-defined schema name of the UDF given the arguments + fn schema_name(&self, args: &[Expr]) -> Result { + let args_name = args.iter().map(Expr::schema_name).collect::>>()?; + // TODO: join with ", " to standardize the formatting of Vec + Ok(format!("{}({})", self.name(), args_name.join(","))) + } + /// Returns the function's [`Signature`] for information about what input /// types are accepted and the function's Volatility. fn signature(&self) -> &Signature; diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index af4e36926b68..aab9933f6c31 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -96,7 +96,7 @@ impl ScalarUDFImpl for ArrayElement { "array_element" } - fn display_name(&self, args: &[Expr]) -> Result { + fn schema_name(&self, args: &[Expr]) -> Result { Ok(format!( "{}[{}]", get_arg_name(args, 0), @@ -253,7 +253,7 @@ impl ScalarUDFImpl for ArraySlice { self } - fn display_name(&self, args: &[Expr]) -> Result { + fn schema_name(&self, args: &[Expr]) -> Result { Ok(format!( "{}[{}]", get_arg_name(args, 0), diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 2c2e36b91b13..139d20901811 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -57,7 +57,7 @@ impl ScalarUDFImpl for GetFieldFunc { "get_field" } - fn display_name(&self, args: &[Expr]) -> Result { + fn schema_name(&self, args: &[Expr]) -> Result { if args.len() != 2 { return exec_err!( "get_field function requires 2 arguments, got {}", @@ -74,7 +74,7 @@ impl ScalarUDFImpl for GetFieldFunc { } }; - Ok(format!("{}[{}]", args[0].display_name()?, name)) + Ok(format!("{}[{}]", args[0].schema_name()?, name)) } fn signature(&self) -> &Signature { From 9f77b0ea58d16425da990fa25cf6151b6837073c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 12:11:23 +0800 Subject: [PATCH 08/32] fmt Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 8 +++----- datafusion/expr/src/udf.rs | 5 ++++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 044bcd1c703b..617ab39b8f27 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1013,9 +1013,7 @@ impl Expr { Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), - Expr::ScalarFunction(ScalarFunction { func, args }) => { - func.schema_name(args) - } + Expr::ScalarFunction(ScalarFunction { func, args }) => func.schema_name(args), Expr::Unnest(Unnest { expr }) => { Ok(format!("unnest({})", expr.schema_name()?)) } @@ -2051,8 +2049,8 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::Literal(_) | Expr::Placeholder(_) | Expr::OuterReferenceColumn(..) - | Expr::ScalarVariable(..) | - Expr::Wildcard { .. }=> write!(w, "{e}")?, + | Expr::ScalarVariable(..) + | Expr::Wildcard { .. } => write!(w, "{e}")?, Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, Expr::BinaryExpr(binary_expr) => { diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 8135a3a3bfb8..04a9c7bbbf9f 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -360,7 +360,10 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// Returns the user-defined schema name of the UDF given the arguments fn schema_name(&self, args: &[Expr]) -> Result { - let args_name = args.iter().map(Expr::schema_name).collect::>>()?; + let args_name = args + .iter() + .map(Expr::schema_name) + .collect::>>()?; // TODO: join with ", " to standardize the formatting of Vec Ok(format!("{}({})", self.name(), args_name.join(","))) } From d324fb8c00761c9fedba2bf118ca800466bfe6cc Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 13:40:43 +0800 Subject: [PATCH 09/32] add display name for udf Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 5 ++++- datafusion/functions-nested/src/extract.rs | 8 ++++++++ datafusion/functions/src/core/getfield.rs | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 617ab39b8f27..e6a5bc7b486b 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1859,6 +1859,10 @@ impl fmt::Display for Expr { Expr::ScalarFunction(fun) => { fmt_function(f, fun.name(), false, &fun.args, true) } + // TODO: use udf's display_name + // Expr::ScalarFunction(s) => { + // write!(f, "{}", func.display_name(args).unwrap()) + // } Expr::WindowFunction(WindowFunction { fun, args, @@ -2011,7 +2015,6 @@ fn fmt_function( false => args.iter().map(|arg| format!("{arg:?}")).collect(), }; - // let args: Vec = args.iter().map(|arg| format!("{:?}", arg)).collect(); let distinct_str = match distinct { true => "DISTINCT ", false => "", diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index aab9933f6c31..7a144cb99380 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -96,6 +96,10 @@ impl ScalarUDFImpl for ArrayElement { "array_element" } + fn display_name(&self, args: &[Expr]) -> Result { + self.schema_name(args) + } + fn schema_name(&self, args: &[Expr]) -> Result { Ok(format!( "{}[{}]", @@ -253,6 +257,10 @@ impl ScalarUDFImpl for ArraySlice { self } + fn display_name(&self, args: &[Expr]) -> Result { + self.schema_name(args) + } + fn schema_name(&self, args: &[Expr]) -> Result { Ok(format!( "{}[{}]", diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 139d20901811..9cfc24f63524 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -57,6 +57,10 @@ impl ScalarUDFImpl for GetFieldFunc { "get_field" } + fn display_name(&self, args: &[Expr]) -> Result { + self.schema_name(args) + } + fn schema_name(&self, args: &[Expr]) -> Result { if args.len() != 2 { return exec_err!( From 8760d56c5a1a5b73bc04b53f24e4c52663ae2099 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 14:42:02 +0800 Subject: [PATCH 10/32] fix name in udf Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 69 +++++++------------- datafusion/functions-nested/src/extract.rs | 47 ++++++++----- datafusion/functions-nested/src/utils.rs | 7 +- datafusion/functions/src/core/getfield.rs | 18 ++++- datafusion/sqllogictest/test_files/array.slt | 4 +- 5 files changed, 76 insertions(+), 69 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index e6a5bc7b486b..69e76998538e 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1008,15 +1008,27 @@ impl Expr { /// Returns the name for schema / field that is different from display pub fn schema_name(&self) -> Result { match self { - Expr::Column(c) => Ok(format!("{c}")), // expr is not shown since it is aliased Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), - Expr::ScalarFunction(ScalarFunction { func, args }) => func.schema_name(args), + Expr::Column(c) => Ok(format!("{c}")), + Expr::IsNull(expr) => Ok(format!("{} IS NULL", expr.schema_name()?)), + Expr::IsNotNull(expr) => Ok(format!("{} IS NOT NULL", expr.schema_name()?)), + Expr::IsUnknown(expr) => Ok(format!("{} IS UNKNOWN", expr.schema_name()?)), + Expr::IsNotUnknown(expr) => { + Ok(format!("{} IS NOT UNKNOWN", expr.schema_name()?)) + } + Expr::IsTrue(expr) => Ok(format!("{} IS TRUE", expr.schema_name()?)), + Expr::IsFalse(expr) => Ok(format!("{} IS FALSE", expr.schema_name()?)), + Expr::IsNotTrue(expr) => Ok(format!("{} IS NOT TRUE", expr.schema_name()?)), + Expr::IsNotFalse(expr) => Ok(format!("{} IS NOT FALSE", expr.schema_name()?)), + Expr::Negative(expr) => Ok(format!("(- {})", expr.schema_name()?)), + Expr::Not(expr) => Ok(format!("NOT {}", expr.schema_name()?)), Expr::Unnest(Unnest { expr }) => { Ok(format!("unnest({})", expr.schema_name()?)) } + Expr::ScalarFunction(ScalarFunction { func, args }) => func.schema_name(args), // Most of the expr has no difference _ => self.display_name(), } @@ -2050,8 +2062,18 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { // Reuse Display trait Expr::Column(_) | Expr::Literal(_) - | Expr::Placeholder(_) + | Expr::IsFalse(_) + | Expr::IsNotFalse(_) + | Expr::IsTrue(_) + | Expr::IsNotTrue(_) + | Expr::IsUnknown(_) + | Expr::IsNotUnknown(_) + | Expr::IsNull(_) + | Expr::IsNotNull(_) + | Expr::Negative(_) + | Expr::Not(_) | Expr::OuterReferenceColumn(..) + | Expr::Placeholder(_) | Expr::ScalarVariable(..) | Expr::Wildcard { .. } => write!(w, "{e}")?, @@ -2130,47 +2152,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { // CAST does not change the expression name write_name(w, expr)?; } - Expr::Not(expr) => { - w.write_str("NOT ")?; - write_name(w, expr)?; - } - Expr::Negative(expr) => { - w.write_str("(- ")?; - write_name(w, expr)?; - w.write_str(")")?; - } - Expr::IsNull(expr) => { - write_name(w, expr)?; - w.write_str(" IS NULL")?; - } - Expr::IsNotNull(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT NULL")?; - } - Expr::IsTrue(expr) => { - write_name(w, expr)?; - w.write_str(" IS TRUE")?; - } - Expr::IsFalse(expr) => { - write_name(w, expr)?; - w.write_str(" IS FALSE")?; - } - Expr::IsUnknown(expr) => { - write_name(w, expr)?; - w.write_str(" IS UNKNOWN")?; - } - Expr::IsNotTrue(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT TRUE")?; - } - Expr::IsNotFalse(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT FALSE")?; - } - Expr::IsNotUnknown(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT UNKNOWN")?; - } Expr::Exists(Exists { negated: true, .. }) => w.write_str("NOT EXISTS")?, Expr::Exists(Exists { negated: false, .. }) => w.write_str("EXISTS")?, Expr::InSubquery(InSubquery { negated: true, .. }) => w.write_str("NOT IN")?, diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index 7a144cb99380..1d95e55aaa44 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -40,7 +40,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; -use crate::utils::{get_arg_name, make_scalar_function}; +use crate::utils::make_scalar_function; // Create static instances of ScalarUDFs for each function make_udf_expr_and_func!( @@ -97,15 +97,24 @@ impl ScalarUDFImpl for ArrayElement { } fn display_name(&self, args: &[Expr]) -> Result { - self.schema_name(args) + let args_name = args.iter().map(ToString::to_string).collect::>(); + if args_name.len() != 2 { + return exec_err!("expect 2 args, got {}", args_name.len()); + } + + Ok(format!("{}[{}]", args_name[0], args_name[1])) } fn schema_name(&self, args: &[Expr]) -> Result { - Ok(format!( - "{}[{}]", - get_arg_name(args, 0), - get_arg_name(args, 1) - )) + let args_name = args + .iter() + .map(Expr::schema_name) + .collect::>>()?; + if args_name.len() != 2 { + return exec_err!("expect 2 args, got {}", args_name.len()); + } + + Ok(format!("{}[{}]", args_name[0], args_name[1])) } fn signature(&self) -> &Signature { @@ -258,18 +267,24 @@ impl ScalarUDFImpl for ArraySlice { } fn display_name(&self, args: &[Expr]) -> Result { - self.schema_name(args) + let args_name = args.iter().map(ToString::to_string).collect::>(); + if let Some((arr, indexes)) = args_name.split_first() { + Ok(format!("{arr}[{}]", indexes.join(":"))) + } else { + exec_err!("no argument") + } } fn schema_name(&self, args: &[Expr]) -> Result { - Ok(format!( - "{}[{}]", - get_arg_name(args, 0), - (1..args.len()) - .map(|i| get_arg_name(args, i)) - .collect::>() - .join(":") - )) + let args_name = args + .iter() + .map(Expr::schema_name) + .collect::>>()?; + if let Some((arr, indexes)) = args_name.split_first() { + Ok(format!("{arr}[{}]", indexes.join(":"))) + } else { + exec_err!("no argument") + } } fn name(&self) -> &str { diff --git a/datafusion/functions-nested/src/utils.rs b/datafusion/functions-nested/src/utils.rs index f396c3b22581..688e1633e5cf 100644 --- a/datafusion/functions-nested/src/utils.rs +++ b/datafusion/functions-nested/src/utils.rs @@ -32,7 +32,7 @@ use datafusion_common::{exec_err, plan_err, Result, ScalarValue}; use core::any::type_name; use datafusion_common::DataFusionError; -use datafusion_expr::{ColumnarValue, Expr, ScalarFunctionImplementation}; +use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; macro_rules! downcast_arg { ($ARG:expr, $ARRAY_TYPE:ident) => {{ @@ -253,11 +253,6 @@ pub(crate) fn compute_array_dims( } } -/// Returns the name of the argument at index `i`, or an empty string if the index is out of bounds. -pub(super) fn get_arg_name(args: &[Expr], i: usize) -> String { - args.get(i).map(ToString::to_string).unwrap_or_default() -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 9cfc24f63524..66caa092f103 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -58,7 +58,23 @@ impl ScalarUDFImpl for GetFieldFunc { } fn display_name(&self, args: &[Expr]) -> Result { - self.schema_name(args) + if args.len() != 2 { + return exec_err!( + "get_field function requires 2 arguments, got {}", + args.len() + ); + } + + let name = match &args[1] { + Expr::Literal(name) => name, + _ => { + return exec_err!( + "get_field function requires the argument field_name to be a string" + ); + } + }; + + Ok(format!("{}[{}]", args[0].display_name()?, name)) } fn schema_name(&self, args: &[Expr]) -> Result { diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index f2972e4c14c2..4b04d476dc01 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -1137,7 +1137,7 @@ from arrays_values_without_nulls; ## array_element (aliases: array_extract, list_extract, list_element) # Testing with empty arguments should result in an error -query error DataFusion error: Error during planning: Error during planning: array_element does not support zero arguments. +query error DataFusion error: Execution error: expect 2 args, got 0 select array_element(); # array_element error @@ -1979,7 +1979,7 @@ select array_slice(a, -1, 2, 1), array_slice(a, -1, 2), [6.0] [6.0] [] [] # Testing with empty arguments should result in an error -query error DataFusion error: Error during planning: Error during planning: array_slice does not support zero arguments. +query error DataFusion error: Execution error: no argument select array_slice(); From af49720179d31f317f08fe1430a36e175c668867 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 15:01:04 +0800 Subject: [PATCH 11/32] rename unnest Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 15 ++++++- datafusion/expr/src/udf.rs | 4 +- .../test_files/push_down_filter.slt | 40 +++++++++---------- datafusion/sqllogictest/test_files/unnest.slt | 18 ++++----- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 69e76998538e..2fa941f1addb 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1006,10 +1006,21 @@ impl Expr { } /// Returns the name for schema / field that is different from display + /// Most of the expressions are the same as display_name + /// Those are the main difference + /// 1. Alias + /// 2. Cast pub fn schema_name(&self) -> Result { match self { // expr is not shown since it is aliased Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), + Expr::Between(Between { expr, negated, low, high }) => { + if *negated { + Ok(format!("{} NOT BETWEEN {} AND {}", expr.schema_name()?, low.schema_name()?, high.schema_name()?)) + } else { + Ok(format!("{} BETWEEN {} AND {}", expr.schema_name()?, low.schema_name()?, high.schema_name()?)) + } + } // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), Expr::Column(c) => Ok(format!("{c}")), @@ -1026,10 +1037,10 @@ impl Expr { Expr::Negative(expr) => Ok(format!("(- {})", expr.schema_name()?)), Expr::Not(expr) => Ok(format!("NOT {}", expr.schema_name()?)), Expr::Unnest(Unnest { expr }) => { - Ok(format!("unnest({})", expr.schema_name()?)) + Ok(format!("UNNEST({})", expr.schema_name()?)) } Expr::ScalarFunction(ScalarFunction { func, args }) => func.schema_name(args), - // Most of the expr has no difference + // other exprs has no difference _ => self.display_name(), } } diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 04a9c7bbbf9f..8cdb9a9dfcb6 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -354,7 +354,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// Returns the user-defined display name of the UDF given the arguments fn display_name(&self, args: &[Expr]) -> Result { let names: Vec = args.iter().map(create_name).collect::>()?; - // TODO: join with ", " to standardize the formatting of Vec + // TODO: join with ", " to standardize the formatting of Vec, Ok(format!("{}({})", self.name(), names.join(","))) } @@ -364,7 +364,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { .iter() .map(Expr::schema_name) .collect::>>()?; - // TODO: join with ", " to standardize the formatting of Vec + // TODO: join with ", " to standardize the formatting of Vec, Ok(format!("{}({})", self.name(), args_name.join(","))) } diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index 3ca187ddee84..2d74c1fc6994 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -36,9 +36,9 @@ query TT explain select uc2 from (select unnest(column2) as uc2, column1 from v) where column1 = 2; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2 -02)--Unnest: lists[unnest(v.column2)] structs[] -03)----Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2 +02)--Unnest: lists[UNNEST(v.column2)] structs[] +03)----Projection: v.column2 AS UNNEST(v.column2), v.column1 04)------Filter: v.column1 = Int64(2) 05)--------TableScan: v projection=[column1, column2] @@ -53,11 +53,11 @@ query TT explain select uc2 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2 -02)--Filter: unnest(v.column2) > Int64(3) -03)----Projection: unnest(v.column2) -04)------Unnest: lists[unnest(v.column2)] structs[] -05)--------Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2 +02)--Filter: UNNEST(v.column2) > Int64(3) +03)----Projection: UNNEST(v.column2) +04)------Unnest: lists[UNNEST(v.column2)] structs[] +05)--------Projection: v.column2 AS UNNEST(v.column2), v.column1 06)----------TableScan: v projection=[column1, column2] query II @@ -71,10 +71,10 @@ query TT explain select uc2, column1 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3 AND column1 = 2; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2, v.column1 -02)--Filter: unnest(v.column2) > Int64(3) -03)----Unnest: lists[unnest(v.column2)] structs[] -04)------Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2, v.column1 +02)--Filter: UNNEST(v.column2) > Int64(3) +03)----Unnest: lists[UNNEST(v.column2)] structs[] +04)------Projection: v.column2 AS UNNEST(v.column2), v.column1 05)--------Filter: v.column1 = Int64(2) 06)----------TableScan: v projection=[column1, column2] @@ -90,10 +90,10 @@ query TT explain select uc2, column1 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3 OR column1 = 2; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2, v.column1 -02)--Filter: unnest(v.column2) > Int64(3) OR v.column1 = Int64(2) -03)----Unnest: lists[unnest(v.column2)] structs[] -04)------Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2, v.column1 +02)--Filter: UNNEST(v.column2) > Int64(3) OR v.column1 = Int64(2) +03)----Unnest: lists[UNNEST(v.column2)] structs[] +04)------Projection: v.column2 AS UNNEST(v.column2), v.column1 05)--------TableScan: v projection=[column1, column2] statement ok @@ -112,10 +112,10 @@ query TT explain select * from (select column1, unnest(column2) as o from d) where o['a'] = 1; ---- logical_plan -01)Projection: d.column1, unnest(d.column2) AS o -02)--Filter: get_field(unnest(d.column2), Utf8("a")) = Int64(1) -03)----Unnest: lists[unnest(d.column2)] structs[] -04)------Projection: d.column1, d.column2 AS unnest(d.column2) +01)Projection: d.column1, UNNEST(d.column2) AS o +02)--Filter: get_field(UNNEST(d.column2), Utf8("a")) = Int64(1) +03)----Unnest: lists[UNNEST(d.column2)] structs[] +04)------Projection: d.column1, d.column2 AS UNNEST(d.column2) 05)--------TableScan: d projection=[column1, column2] diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index d818c0e92795..4957011b8ba2 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -539,21 +539,21 @@ query TT explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unnest_table; ---- logical_plan -01)Unnest: lists[unnest(unnest(unnest(recursive_unnest_table.column3)[c1]))] structs[] -02)--Projection: unnest(unnest(recursive_unnest_table.column3)[c1]) AS unnest(unnest(unnest(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 -03)----Unnest: lists[unnest(unnest(recursive_unnest_table.column3)[c1])] structs[] -04)------Projection: get_field(unnest(recursive_unnest_table.column3), Utf8("c1")) AS unnest(unnest(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 -05)--------Unnest: lists[unnest(recursive_unnest_table.column3)] structs[] -06)----------Projection: recursive_unnest_table.column3 AS unnest(recursive_unnest_table.column3), recursive_unnest_table.column3 +01)Unnest: lists[UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1]))] structs[] +02)--Projection: UNNEST(UNNEST(recursive_unnest_table.column3)[c1]) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 +03)----Unnest: lists[UNNEST(UNNEST(recursive_unnest_table.column3)[c1])] structs[] +04)------Projection: get_field(UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS UNNEST(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 +05)--------Unnest: lists[UNNEST(recursive_unnest_table.column3)] structs[] +06)----------Projection: recursive_unnest_table.column3 AS UNNEST(recursive_unnest_table.column3), recursive_unnest_table.column3 07)------------TableScan: recursive_unnest_table projection=[column3] physical_plan 01)UnnestExec -02)--ProjectionExec: expr=[unnest(unnest(recursive_unnest_table.column3)[c1])@0 as unnest(unnest(unnest(recursive_unnest_table.column3)[c1])), column3@1 as column3] +02)--ProjectionExec: expr=[UNNEST(UNNEST(recursive_unnest_table.column3)[c1])@0 as UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), column3@1 as column3] 03)----UnnestExec -04)------ProjectionExec: expr=[get_field(unnest(recursive_unnest_table.column3)@0, c1) as unnest(unnest(recursive_unnest_table.column3)[c1]), column3@1 as column3] +04)------ProjectionExec: expr=[get_field(UNNEST(recursive_unnest_table.column3)@0, c1) as UNNEST(UNNEST(recursive_unnest_table.column3)[c1]), column3@1 as column3] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 06)----------UnnestExec -07)------------ProjectionExec: expr=[column3@0 as unnest(recursive_unnest_table.column3), column3@0 as column3] +07)------------ProjectionExec: expr=[column3@0 as UNNEST(recursive_unnest_table.column3), column3@0 as column3] 08)--------------MemoryExec: partitions=1, partition_sizes=[1] ## group by unnest From 228453578a555f8a9169312a02dcb83299cfb354 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 15:02:52 +0800 Subject: [PATCH 12/32] rm column Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 2fa941f1addb..a9bd2013d514 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1014,16 +1014,30 @@ impl Expr { match self { // expr is not shown since it is aliased Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), - Expr::Between(Between { expr, negated, low, high }) => { + Expr::Between(Between { + expr, + negated, + low, + high, + }) => { if *negated { - Ok(format!("{} NOT BETWEEN {} AND {}", expr.schema_name()?, low.schema_name()?, high.schema_name()?)) + Ok(format!( + "{} NOT BETWEEN {} AND {}", + expr.schema_name()?, + low.schema_name()?, + high.schema_name()? + )) } else { - Ok(format!("{} BETWEEN {} AND {}", expr.schema_name()?, low.schema_name()?, high.schema_name()?)) + Ok(format!( + "{} BETWEEN {} AND {}", + expr.schema_name()?, + low.schema_name()?, + high.schema_name()? + )) } } // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), - Expr::Column(c) => Ok(format!("{c}")), Expr::IsNull(expr) => Ok(format!("{} IS NULL", expr.schema_name()?)), Expr::IsNotNull(expr) => Ok(format!("{} IS NOT NULL", expr.schema_name()?)), Expr::IsUnknown(expr) => Ok(format!("{} IS UNKNOWN", expr.schema_name()?)), From 2a8350efa81893c287a7b055ca74be359a173c7e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 15:10:20 +0800 Subject: [PATCH 13/32] inlis Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index a9bd2013d514..55a8e37ca4f0 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1038,6 +1038,16 @@ impl Expr { } // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), + Expr::InList(InList { expr, list, negated }) => { + let inlist_exprs = list.iter().map(Self::schema_name).collect::>>()?; + let inlist_name = inlist_exprs.join(", "); + + if *negated { + Ok(format!("{} NOT IN {}", expr.schema_name()?, inlist_name)) + } else { + Ok(format!("{} IN {}", expr.schema_name()?, inlist_name)) + } + } Expr::IsNull(expr) => Ok(format!("{} IS NULL", expr.schema_name()?)), Expr::IsNotNull(expr) => Ok(format!("{} IS NOT NULL", expr.schema_name()?)), Expr::IsUnknown(expr) => Ok(format!("{} IS UNKNOWN", expr.schema_name()?)), From 4ddf57e01dfd5a3b1192f408e098f2e313e19a0d Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 15:10:29 +0800 Subject: [PATCH 14/32] fmt Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 55a8e37ca4f0..c211537fbdc2 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1038,8 +1038,15 @@ impl Expr { } // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), - Expr::InList(InList { expr, list, negated }) => { - let inlist_exprs = list.iter().map(Self::schema_name).collect::>>()?; + Expr::InList(InList { + expr, + list, + negated, + }) => { + let inlist_exprs = list + .iter() + .map(Self::schema_name) + .collect::>>()?; let inlist_name = inlist_exprs.join(", "); if *negated { From 7ba77cacb1952337af86eb3d03096f5bf4feb052 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 15:40:50 +0800 Subject: [PATCH 15/32] udaf Signed-off-by: jayzhan211 --- datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/expr.rs | 74 +++++++++++++++++-------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 44b2e6b2d6b7..18060fdd79db 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1887,7 +1887,7 @@ pub fn create_aggregate_expr_and_maybe_filter( // unpack (nested) aliased logical expressions, e.g. "sum(col) as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (name.clone(), expr.as_ref()), - Expr::AggregateFunction(_) => (e.display_name().unwrap_or(physical_name(e)?), e), + Expr::AggregateFunction(_) => (e.schema_name().unwrap_or(physical_name(e)?), e), _ => (physical_name(e)?, e), }; diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index c211537fbdc2..69cda70d04a5 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -38,7 +38,7 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; use datafusion_common::{ - internal_err, plan_err, Column, DFSchema, Result, ScalarValue, TableReference, + plan_err, Column, DFSchema, Result, ScalarValue, TableReference, }; use sqlparser::ast::NullTreatment; @@ -1012,6 +1012,51 @@ impl Expr { /// 2. Cast pub fn schema_name(&self) -> Result { match self { + Expr::AggregateFunction(AggregateFunction { + func_def, + args, + distinct, + filter, + order_by, + null_treatment, + }) => { + let mut s = String::new(); + + let args_name = args + .iter() + .map(Self::schema_name) + .collect::>>()?; + // TODO: join with ", " to standardize the formatting of Vec, + if *distinct { + write!( + &mut s, + "{}(DISTINCT {})", + func_def.name(), + args_name.join(",") + )?; + } else { + write!(&mut s, "{}({})", func_def.name(), args_name.join(","))?; + } + + if let Some(null_treatment) = null_treatment { + write!(&mut s, " {}", null_treatment)?; + } + + if let Some(filter) = filter { + write!(&mut s, " FILTER (WHERE {})", filter.schema_name()?)?; + }; + + if let Some(order_by) = order_by { + let order_by_name = order_by + .iter() + .map(Self::schema_name) + .collect::>>()? + .join(", "); + write!(&mut s, " ORDER BY [{}]", order_by_name)?; + }; + + Ok(s) + } // expr is not shown since it is aliased Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), Expr::Between(Between { @@ -1038,6 +1083,7 @@ impl Expr { } // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), + Expr::Column(c) => Ok(format!("{c}")), Expr::InList(InList { expr, list, @@ -2102,7 +2148,8 @@ pub(crate) fn create_name(e: &Expr) -> Result { fn write_name(w: &mut W, e: &Expr) -> Result<()> { match e { // Reuse Display trait - Expr::Column(_) + Expr::AggregateFunction(_) + | Expr::Column(_) | Expr::Literal(_) | Expr::IsFalse(_) | Expr::IsNotFalse(_) @@ -2117,6 +2164,7 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::OuterReferenceColumn(..) | Expr::Placeholder(_) | Expr::ScalarVariable(..) + | Expr::Sort(..) | Expr::Wildcard { .. } => write!(w, "{e}")?, Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, @@ -2234,25 +2282,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { w.write_str(" ")?; write!(w, "{window_frame}")?; } - Expr::AggregateFunction(AggregateFunction { - func_def, - distinct, - args, - filter, - order_by, - null_treatment, - }) => { - write_function_name(w, func_def.name(), *distinct, args)?; - if let Some(fe) = filter { - write!(w, " FILTER (WHERE {fe})")?; - }; - if let Some(order_by) = order_by { - write!(w, " ORDER BY [{}]", expr_vec_fmt!(order_by))?; - }; - if let Some(nt) = null_treatment { - write!(w, " {}", nt)?; - } - } Expr::GroupingSet(grouping_set) => match grouping_set { GroupingSet::Rollup(exprs) => { write!(w, "ROLLUP (")?; @@ -2306,9 +2335,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { write!(w, " AND ")?; write_name(w, high)?; } - Expr::Sort { .. } => { - return internal_err!("Create name does not support sort expression") - } }; Ok(()) } From 04dd18b408f70052951138ff52fa3dfb5609bc53 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 15:55:34 +0800 Subject: [PATCH 16/32] case Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 71 ++++++++++++------------------------- 1 file changed, 22 insertions(+), 49 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 69cda70d04a5..edaf75a45fcb 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1081,6 +1081,25 @@ impl Expr { )) } } + Expr::Case(Case { expr, when_then_expr, else_expr }) => { + let mut s = String::new(); + write!(&mut s, "CASE ")?; + + if let Some(e) = expr { + write!(&mut s, "{} ", e.schema_name()?)?; + } + + for (when, then) in when_then_expr { + write!(&mut s, "WHEN {} THEN {} ", when.schema_name()?, then.schema_name()?)?; + } + + if let Some(e) = else_expr { + write!(&mut s, "ELSE {} ", e.schema_name()?)?; + } + + write!(&mut s, "END")?; + Ok(s) + } // cast expr is not shown to be consistant with Postgres and Spark Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), Expr::Column(c) => Ok(format!("{c}")), @@ -2149,8 +2168,11 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { match e { // Reuse Display trait Expr::AggregateFunction(_) + | Expr::Between(_) + | Expr::Case(_) | Expr::Column(_) | Expr::Literal(_) + | Expr::InList(_) | Expr::IsFalse(_) | Expr::IsNotFalse(_) | Expr::IsTrue(_) @@ -2214,26 +2236,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { write!(w, " CHAR '{char}'")?; } } - Expr::Case(case) => { - write!(w, "CASE ")?; - if let Some(e) = &case.expr { - write_name(w, e)?; - w.write_str(" ")?; - } - for (when, then) in &case.when_then_expr { - w.write_str("WHEN ")?; - write_name(w, when)?; - w.write_str(" THEN ")?; - write_name(w, then)?; - w.write_str(" ")?; - } - if let Some(e) = &case.else_expr { - w.write_str("ELSE ")?; - write_name(w, e)?; - w.write_str(" ")?; - } - w.write_str("END")?; - } Expr::Cast(Cast { expr, .. }) => { // CAST does not change the expression name write_name(w, expr)?; @@ -2306,35 +2308,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { write!(w, ")")?; } }, - Expr::InList(InList { - expr, - list, - negated, - }) => { - write_name(w, expr)?; - let list = list.iter().map(create_name); - if *negated { - write!(w, " NOT IN ({list:?})")?; - } else { - write!(w, " IN ({list:?})")?; - } - } - Expr::Between(Between { - expr, - negated, - low, - high, - }) => { - write_name(w, expr)?; - if *negated { - write!(w, " NOT BETWEEN ")?; - } else { - write!(w, " BETWEEN ")?; - } - write_name(w, low)?; - write!(w, " AND ")?; - write_name(w, high)?; - } }; Ok(()) } From a39212e6320444b7d4d8d1daa8d6a767d0502763 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 16:06:37 +0800 Subject: [PATCH 17/32] use write Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 83 +++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index edaf75a45fcb..ed744f288bcb 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1011,6 +1011,8 @@ impl Expr { /// 1. Alias /// 2. Cast pub fn schema_name(&self) -> Result { + let mut s = String::new(); + match self { Expr::AggregateFunction(AggregateFunction { func_def, @@ -1020,8 +1022,6 @@ impl Expr { order_by, null_treatment, }) => { - let mut s = String::new(); - let args_name = args .iter() .map(Self::schema_name) @@ -1054,11 +1054,9 @@ impl Expr { .join(", "); write!(&mut s, " ORDER BY [{}]", order_by_name)?; }; - - Ok(s) } // expr is not shown since it is aliased - Expr::Alias(Alias { name, .. }) => Ok(name.to_owned()), + Expr::Alias(Alias { name, .. }) => write!(&mut s, "{name}")?, Expr::Between(Between { expr, negated, @@ -1066,23 +1064,28 @@ impl Expr { high, }) => { if *negated { - Ok(format!( + write!( + &mut s, "{} NOT BETWEEN {} AND {}", expr.schema_name()?, low.schema_name()?, high.schema_name()? - )) + )?; } else { - Ok(format!( + write!( + &mut s, "{} BETWEEN {} AND {}", expr.schema_name()?, low.schema_name()?, high.schema_name()? - )) + )?; } } - Expr::Case(Case { expr, when_then_expr, else_expr }) => { - let mut s = String::new(); + Expr::Case(Case { + expr, + when_then_expr, + else_expr, + }) => { write!(&mut s, "CASE ")?; if let Some(e) = expr { @@ -1090,7 +1093,12 @@ impl Expr { } for (when, then) in when_then_expr { - write!(&mut s, "WHEN {} THEN {} ", when.schema_name()?, then.schema_name()?)?; + write!( + &mut s, + "WHEN {} THEN {} ", + when.schema_name()?, + then.schema_name()? + )?; } if let Some(e) = else_expr { @@ -1098,11 +1106,12 @@ impl Expr { } write!(&mut s, "END")?; - Ok(s) } // cast expr is not shown to be consistant with Postgres and Spark - Expr::Cast(Cast { expr, data_type: _ }) => expr.schema_name(), - Expr::Column(c) => Ok(format!("{c}")), + Expr::Cast(Cast { expr, data_type: _ }) => { + write!(&mut s, "{}", expr.schema_name()?)? + } + Expr::Column(c) => write!(&mut s, "{c}")?, Expr::InList(InList { expr, list, @@ -1115,30 +1124,42 @@ impl Expr { let inlist_name = inlist_exprs.join(", "); if *negated { - Ok(format!("{} NOT IN {}", expr.schema_name()?, inlist_name)) + write!(&mut s, "{} NOT IN {}", expr.schema_name()?, inlist_name)?; } else { - Ok(format!("{} IN {}", expr.schema_name()?, inlist_name)) + write!(&mut s, "{} IN {}", expr.schema_name()?, inlist_name)?; } } - Expr::IsNull(expr) => Ok(format!("{} IS NULL", expr.schema_name()?)), - Expr::IsNotNull(expr) => Ok(format!("{} IS NOT NULL", expr.schema_name()?)), - Expr::IsUnknown(expr) => Ok(format!("{} IS UNKNOWN", expr.schema_name()?)), + Expr::IsNull(expr) => write!(&mut s, "{} IS NULL", expr.schema_name()?)?, + Expr::IsNotNull(expr) => { + write!(&mut s, "{} IS NOT NULL", expr.schema_name()?)? + } + Expr::IsUnknown(expr) => { + write!(&mut s, "{} IS UNKNOWN", expr.schema_name()?)? + } Expr::IsNotUnknown(expr) => { - Ok(format!("{} IS NOT UNKNOWN", expr.schema_name()?)) - } - Expr::IsTrue(expr) => Ok(format!("{} IS TRUE", expr.schema_name()?)), - Expr::IsFalse(expr) => Ok(format!("{} IS FALSE", expr.schema_name()?)), - Expr::IsNotTrue(expr) => Ok(format!("{} IS NOT TRUE", expr.schema_name()?)), - Expr::IsNotFalse(expr) => Ok(format!("{} IS NOT FALSE", expr.schema_name()?)), - Expr::Negative(expr) => Ok(format!("(- {})", expr.schema_name()?)), - Expr::Not(expr) => Ok(format!("NOT {}", expr.schema_name()?)), + write!(&mut s, "{} IS NOT UNKNOWN", expr.schema_name()?)? + } + Expr::IsTrue(expr) => write!(&mut s, "{} IS TRUE", expr.schema_name()?)?, + Expr::IsFalse(expr) => write!(&mut s, "{} IS FALSE", expr.schema_name()?)?, + Expr::IsNotTrue(expr) => { + write!(&mut s, "{} IS NOT TRUE", expr.schema_name()?)? + } + Expr::IsNotFalse(expr) => { + write!(&mut s, "{} IS NOT FALSE", expr.schema_name()?)? + } + Expr::Negative(expr) => write!(&mut s, "(- {})", expr.schema_name()?)?, + Expr::Not(expr) => write!(&mut s, "NOT {}", expr.schema_name()?)?, Expr::Unnest(Unnest { expr }) => { - Ok(format!("UNNEST({})", expr.schema_name()?)) + write!(&mut s, "UNNEST({})", expr.schema_name()?)? + } + Expr::ScalarFunction(ScalarFunction { func, args }) => { + write!(&mut s, "{}", func.schema_name(args)?)? } - Expr::ScalarFunction(ScalarFunction { func, args }) => func.schema_name(args), // other exprs has no difference - _ => self.display_name(), + _ => write!(&mut s, "{}", self.display_name()?)?, } + + Ok(s) } /// Returns a full and complete string representation of this expression. From 7384bb814dff3878ed6f4bf6778b2630445871a6 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 16:28:51 +0800 Subject: [PATCH 18/32] fix test Signed-off-by: jayzhan211 --- .../optimizer/src/analyzer/type_coercion.rs | 8 ++----- datafusion/sql/src/utils.rs | 22 +++++++++---------- datafusion/sql/tests/cases/plan_to_sql.rs | 8 +++---- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index bcd1cbcce23e..57a5bcd2b791 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1042,9 +1042,7 @@ mod test { let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); let empty = empty_with_type(DataType::Int64); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); - let expected = - "Projection: a IN ([CAST(Int32(1) AS Int64), CAST(Int8(4) AS Int64), Int64(8)]) AS a IN (Map { iter: Iter([Literal(Int32(1)), Literal(Int8(4)), Literal(Int64(8))]) })\ - \n EmptyRelation"; + let expected = "Projection: a IN ([CAST(Int32(1) AS Int64), CAST(Int8(4) AS Int64), Int64(8)])\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; // a in (1,4,8), a is decimal @@ -1057,9 +1055,7 @@ mod test { )?), })); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); - let expected = - "Projection: CAST(a AS Decimal128(24, 4)) IN ([CAST(Int32(1) AS Decimal128(24, 4)), CAST(Int8(4) AS Decimal128(24, 4)), CAST(Int64(8) AS Decimal128(24, 4))]) AS a IN (Map { iter: Iter([Literal(Int32(1)), Literal(Int8(4)), Literal(Int64(8))]) })\ - \n EmptyRelation"; + let expected = "Projection: CAST(a AS Decimal128(24, 4)) IN ([CAST(Int32(1) AS Decimal128(24, 4)), CAST(Int8(4) AS Decimal128(24, 4)), CAST(Int64(8) AS Decimal128(24, 4))])\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected) } diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index eef8763024a1..0356727720cd 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -469,16 +469,16 @@ mod tests { assert_eq!( transformed_exprs, vec![ - col("unnest(struct_col).field1"), - col("unnest(struct_col).field2"), + col("UNNEST(struct_col).field1"), + col("UNNEST(struct_col).field2"), ] ); - assert_eq!(unnest_placeholder_columns, vec!["unnest(struct_col)"]); + assert_eq!(unnest_placeholder_columns, vec!["UNNEST(struct_col)"]); // still reference struct_col in original schema but with alias, // to avoid colliding with the projection on the column itself if any assert_eq!( inner_projection_exprs, - vec![col("struct_col").alias("unnest(struct_col)"),] + vec![col("struct_col").alias("UNNEST(struct_col)"),] ); // unnest(array_col) + 1 @@ -491,12 +491,12 @@ mod tests { )?; assert_eq!( unnest_placeholder_columns, - vec!["unnest(struct_col)", "unnest(array_col)"] + vec!["UNNEST(struct_col)", "UNNEST(array_col)"] ); // only transform the unnest children assert_eq!( transformed_exprs, - vec![col("unnest(array_col)").add(lit(1i64))] + vec![col("UNNEST(array_col)").add(lit(1i64))] ); // keep appending to the current vector @@ -505,8 +505,8 @@ mod tests { assert_eq!( inner_projection_exprs, vec![ - col("struct_col").alias("unnest(struct_col)"), - col("array_col").alias("unnest(array_col)") + col("struct_col").alias("UNNEST(struct_col)"), + col("array_col").alias("UNNEST(array_col)") ] ); @@ -553,17 +553,17 @@ mod tests { // Only the inner most/ bottom most unnest is transformed assert_eq!( transformed_exprs, - vec![unnest(col("unnest(struct_col[matrix])"))] + vec![unnest(col("UNNEST(struct_col[matrix])"))] ); assert_eq!( unnest_placeholder_columns, - vec!["unnest(struct_col[matrix])"] + vec!["UNNEST(struct_col[matrix])"] ); assert_eq!( inner_projection_exprs, vec![col("struct_col") .field("matrix") - .alias("unnest(struct_col[matrix])"),] + .alias("UNNEST(struct_col[matrix])"),] ); Ok(()) diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index bae3ec2e2779..1f0726eb47e6 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -415,10 +415,10 @@ fn test_unnest_logical_plan() -> Result<()> { let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); - let expected = "Projection: unnest(unnest_table.struct_col).field1, unnest(unnest_table.struct_col).field2, unnest(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ - \n Unnest: lists[unnest(unnest_table.array_col)] structs[unnest(unnest_table.struct_col)]\ - \n Projection: unnest_table.struct_col AS unnest(unnest_table.struct_col), unnest_table.array_col AS unnest(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ - \n TableScan: unnest_table"; + let expected = "Projection: UNNEST(unnest_table.struct_col).field1, UNNEST(unnest_table.struct_col).field2, UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ + \n Unnest: lists[UNNEST(unnest_table.array_col)] structs[UNNEST(unnest_table.struct_col)]\ + \n Projection: unnest_table.struct_col AS UNNEST(unnest_table.struct_col), unnest_table.array_col AS UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ + \n TableScan: unnest_table"; assert_eq!(format!("{plan:?}"), expected); From 9f82bac1c96e3b3d7ffc65fee6acfe7b3ca5f27e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 16:44:32 +0800 Subject: [PATCH 19/32] fix window Signed-off-by: jayzhan211 --- datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/expr.rs | 35 ++++++------------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 18060fdd79db..278c13eaa7b6 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1794,7 +1794,7 @@ pub fn create_window_expr( // unpack aliased logical expressions, e.g. "sum(col) over () as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (name.clone(), expr.as_ref()), - _ => (e.display_name()?, e), + _ => (e.schema_name()?, e), }; create_window_expr_with_name(e, name, logical_schema, execution_props) } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ed744f288bcb..a2d899bcb3ac 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1081,6 +1081,9 @@ impl Expr { )?; } } + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + write!(&mut s, "{} {op} {}", left.schema_name()?, right.schema_name()?)?; + } Expr::Case(Case { expr, when_then_expr, @@ -1112,23 +1115,8 @@ impl Expr { write!(&mut s, "{}", expr.schema_name()?)? } Expr::Column(c) => write!(&mut s, "{c}")?, - Expr::InList(InList { - expr, - list, - negated, - }) => { - let inlist_exprs = list - .iter() - .map(Self::schema_name) - .collect::>>()?; - let inlist_name = inlist_exprs.join(", "); - - if *negated { - write!(&mut s, "{} NOT IN {}", expr.schema_name()?, inlist_name)?; - } else { - write!(&mut s, "{} IN {}", expr.schema_name()?, inlist_name)?; - } - } + Expr::Exists(Exists { negated: true, .. }) => write!(&mut s, "NOT EXISTS")?, + Expr::Exists(Exists { negated: false, .. }) => write!(&mut s, "EXISTS")?, Expr::IsNull(expr) => write!(&mut s, "{} IS NULL", expr.schema_name()?)?, Expr::IsNotNull(expr) => { write!(&mut s, "{} IS NOT NULL", expr.schema_name()?)? @@ -2137,6 +2125,7 @@ impl fmt::Display for Expr { }, Expr::Placeholder(Placeholder { id, .. }) => write!(f, "{id}"), Expr::Unnest(Unnest { expr }) => { + // TODO: use Display instead of Debug write!(f, "UNNEST({expr:?})") } } @@ -2190,6 +2179,7 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { // Reuse Display trait Expr::AggregateFunction(_) | Expr::Between(_) + | Expr::BinaryExpr(_) | Expr::Case(_) | Expr::Column(_) | Expr::Literal(_) @@ -2206,16 +2196,12 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::Not(_) | Expr::OuterReferenceColumn(..) | Expr::Placeholder(_) + | Expr::Unnest(_) | Expr::ScalarVariable(..) | Expr::Sort(..) | Expr::Wildcard { .. } => write!(w, "{e}")?, Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, - Expr::BinaryExpr(binary_expr) => { - write_name(w, binary_expr.left.as_ref())?; - write!(w, " {} ", binary_expr.op)?; - write_name(w, binary_expr.right.as_ref())?; - } Expr::Like(Like { negated, expr, @@ -2272,11 +2258,6 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { Expr::ScalarSubquery(subquery) => { w.write_str(subquery.subquery.schema().field(0).name().as_str())?; } - Expr::Unnest(Unnest { expr }) => { - w.write_str("unnest(")?; - write_name(w, expr)?; - w.write_str(")")?; - } Expr::ScalarFunction(fun) => { w.write_str(fun.func.display_name(&fun.args)?.as_str())?; } From da2a1f1c5f9b2de6ec9ca5653d7298c4610671a0 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 17:16:09 +0800 Subject: [PATCH 20/32] window Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 191 ++++++++++++++++++++++++------------ 1 file changed, 128 insertions(+), 63 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index a2d899bcb3ac..6eefb318d271 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1082,7 +1082,12 @@ impl Expr { } } Expr::BinaryExpr(BinaryExpr { left, op, right }) => { - write!(&mut s, "{} {op} {}", left.schema_name()?, right.schema_name()?)?; + write!( + &mut s, + "{} {op} {}", + left.schema_name()?, + right.schema_name()? + )?; } Expr::Case(Case { expr, @@ -1115,8 +1120,53 @@ impl Expr { write!(&mut s, "{}", expr.schema_name()?)? } Expr::Column(c) => write!(&mut s, "{c}")?, + Expr::InList(InList { + expr, + list, + negated, + }) => { + let inlist_exprs = list + .iter() + .map(Self::schema_name) + .collect::>>()?; + let inlist_name = inlist_exprs.join(", "); + + if *negated { + write!(&mut s, "{} NOT IN {}", expr.schema_name()?, inlist_name)?; + } else { + write!(&mut s, "{} IN {}", expr.schema_name()?, inlist_name)?; + } + } Expr::Exists(Exists { negated: true, .. }) => write!(&mut s, "NOT EXISTS")?, Expr::Exists(Exists { negated: false, .. }) => write!(&mut s, "EXISTS")?, + Expr::GroupingSet(GroupingSet::Cube(exprs)) => { + let exprs_name = exprs + .iter() + .map(Self::schema_name) + .collect::>>()? + .join(", "); + write!(&mut s, "ROLLUP ({})", exprs_name)?; + } + Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { + write!(&mut s, "GROUPING SETS (")?; + for (_, exprs) in lists_of_exprs.iter().enumerate() { + let exprs_name = exprs + .iter() + .map(Self::schema_name) + .collect::>>()? + .join(", "); + write!(&mut s, "({})", exprs_name)?; + } + write!(&mut s, ")")?; + } + Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { + let exprs_name = exprs + .iter() + .map(Self::schema_name) + .collect::>>()? + .join(", "); + write!(&mut s, "ROLLUP ({})", exprs_name)?; + } Expr::IsNull(expr) => write!(&mut s, "{} IS NULL", expr.schema_name()?)?, Expr::IsNotNull(expr) => { write!(&mut s, "{} IS NOT NULL", expr.schema_name()?)? @@ -1127,6 +1177,10 @@ impl Expr { Expr::IsNotUnknown(expr) => { write!(&mut s, "{} IS NOT UNKNOWN", expr.schema_name()?)? } + Expr::InSubquery(InSubquery { negated: true, .. }) => { + write!(&mut s, "NOT IN")? + } + Expr::InSubquery(InSubquery { negated: false, .. }) => write!(&mut s, "IN")?, Expr::IsTrue(expr) => write!(&mut s, "{} IS TRUE", expr.schema_name()?)?, Expr::IsFalse(expr) => write!(&mut s, "{} IS FALSE", expr.schema_name()?)?, Expr::IsNotTrue(expr) => { @@ -1143,6 +1197,48 @@ impl Expr { Expr::ScalarFunction(ScalarFunction { func, args }) => { write!(&mut s, "{}", func.schema_name(args)?)? } + Expr::ScalarSubquery(Subquery { subquery, .. }) => { + write!(&mut s, "{}", subquery.schema().field(0).name())?; + } + Expr::WindowFunction(WindowFunction { + fun, + args, + partition_by, + order_by, + window_frame, + null_treatment, + }) => { + let args_name = args + .iter() + .map(Self::schema_name) + .collect::>>()?; + // TODO: join with ", " to standardize the formatting of Vec, + write!(&mut s, "{}({})", fun, args_name.join(","))?; + + if let Some(null_treatment) = null_treatment { + write!(&mut s, " {}", null_treatment)?; + } + + if !partition_by.is_empty() { + let partition_by_name = partition_by + .iter() + .map(Self::schema_name) + .collect::>>()? + .join(", "); + write!(&mut s, " PARTITION BY [{}]", partition_by_name)?; + } + + if !order_by.is_empty() { + let order_by_name = order_by + .iter() + .map(Self::schema_name) + .collect::>>()? + .join(", "); + write!(&mut s, " ORDER BY [{}]", order_by_name)?; + }; + + write!(&mut s, " {window_frame}")?; + } // other exprs has no difference _ => write!(&mut s, "{}", self.display_name()?)?, } @@ -2182,6 +2278,8 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::BinaryExpr(_) | Expr::Case(_) | Expr::Column(_) + | Expr::Exists(_) + | Expr::GroupingSet(_) | Expr::Literal(_) | Expr::InList(_) | Expr::IsFalse(_) @@ -2192,14 +2290,17 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::IsNotUnknown(_) | Expr::IsNull(_) | Expr::IsNotNull(_) + | Expr::InSubquery(_) | Expr::Negative(_) | Expr::Not(_) | Expr::OuterReferenceColumn(..) | Expr::Placeholder(_) | Expr::Unnest(_) + | Expr::ScalarSubquery(..) | Expr::ScalarVariable(..) | Expr::Sort(..) - | Expr::Wildcard { .. } => write!(w, "{e}")?, + | Expr::Wildcard { .. } + | Expr::WindowFunction(_) => write!(w, "{e}")?, Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, Expr::Like(Like { @@ -2251,73 +2352,37 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { // CAST does not change the expression name write_name(w, expr)?; } - Expr::Exists(Exists { negated: true, .. }) => w.write_str("NOT EXISTS")?, - Expr::Exists(Exists { negated: false, .. }) => w.write_str("EXISTS")?, - Expr::InSubquery(InSubquery { negated: true, .. }) => w.write_str("NOT IN")?, - Expr::InSubquery(InSubquery { negated: false, .. }) => w.write_str("IN")?, - Expr::ScalarSubquery(subquery) => { - w.write_str(subquery.subquery.schema().field(0).name().as_str())?; - } Expr::ScalarFunction(fun) => { w.write_str(fun.func.display_name(&fun.args)?.as_str())?; - } - Expr::WindowFunction(WindowFunction { - fun, - args, - window_frame, - partition_by, - order_by, - null_treatment, - }) => { - write_function_name(w, &fun.to_string(), false, args)?; - - if let Some(nt) = null_treatment { - w.write_str(" ")?; - write!(w, "{}", nt)?; - } - if !partition_by.is_empty() { - w.write_str(" ")?; - write!(w, "PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; - } - if !order_by.is_empty() { - w.write_str(" ")?; - write!(w, "ORDER BY [{}]", expr_vec_fmt!(order_by))?; - } - w.write_str(" ")?; - write!(w, "{window_frame}")?; - } - Expr::GroupingSet(grouping_set) => match grouping_set { - GroupingSet::Rollup(exprs) => { - write!(w, "ROLLUP (")?; - write_names(w, exprs.as_slice())?; - write!(w, ")")?; - } - GroupingSet::Cube(exprs) => { - write!(w, "CUBE (")?; - write_names(w, exprs.as_slice())?; - write!(w, ")")?; - } - GroupingSet::GroupingSets(lists_of_exprs) => { - write!(w, "GROUPING SETS (")?; - for (i, exprs) in lists_of_exprs.iter().enumerate() { - if i != 0 { - write!(w, ", ")?; - } - write!(w, "(")?; - write_names(w, exprs.as_slice())?; - write!(w, ")")?; - } - write!(w, ")")?; - } - }, + } // Expr::WindowFunction(WindowFunction { + // fun, + // args, + // window_frame, + // partition_by, + // order_by, + // null_treatment, + // }) => { + // write_function_name(w, &fun.to_string(), false, args)?; + + // if let Some(nt) = null_treatment { + // w.write_str(" ")?; + // write!(w, "{}", nt)?; + // } + // if !partition_by.is_empty() { + // w.write_str(" ")?; + // write!(w, "PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; + // } + // if !order_by.is_empty() { + // w.write_str(" ")?; + // write!(w, "ORDER BY [{}]", expr_vec_fmt!(order_by))?; + // } + // w.write_str(" ")?; + // write!(w, "{window_frame}")?; + // } }; Ok(()) } -fn write_names(w: &mut W, exprs: &[Expr]) -> Result<()> { - exprs.iter().try_for_each(|e| write_name(w, e)) -} - fn write_names_join(w: &mut W, exprs: &[Expr], sep: &str) -> Result<()> { let mut iter = exprs.iter(); if let Some(first_arg) = iter.next() { From 8f3d64af5881af85cae8319e7dd81bf3a58aabe9 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 17:44:25 +0800 Subject: [PATCH 21/32] like and similar to Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 126 +++++------------- .../optimizer/src/analyzer/type_coercion.rs | 6 +- .../tests/cases/consumer_integration.rs | 4 +- 3 files changed, 37 insertions(+), 99 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 6eefb318d271..25106d266ea2 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1189,6 +1189,20 @@ impl Expr { Expr::IsNotFalse(expr) => { write!(&mut s, "{} IS NOT FALSE", expr.schema_name()?)? } + Expr::Like(Like { negated, expr, pattern, escape_char, case_insensitive }) => { + write!( + &mut s, + "{} {}{} {}", + expr.schema_name()?, + if *negated { "NOT " } else { "" }, + if *case_insensitive { "ILIKE" } else { "LIKE" }, + pattern.schema_name()?, + )?; + + if let Some(char) = escape_char { + write!(&mut s, " CHAR '{char}'")?; + } + } Expr::Negative(expr) => write!(&mut s, "(- {})", expr.schema_name()?)?, Expr::Not(expr) => write!(&mut s, "NOT {}", expr.schema_name()?)?, Expr::Unnest(Unnest { expr }) => { @@ -1200,6 +1214,22 @@ impl Expr { Expr::ScalarSubquery(Subquery { subquery, .. }) => { write!(&mut s, "{}", subquery.schema().field(0).name())?; } + Expr::SimilarTo(Like { negated, expr, pattern, escape_char, case_insensitive }) => { + write!( + &mut s, + "{} {} {}", + expr.schema_name()?, + if *negated { + "NOT SIMILAR TO" + } else { + "SIMILAR TO" + }, + pattern.schema_name()?, + )?; + if let Some(char) = escape_char { + write!(&mut s, " CHAR '{char}'")?; + } + } Expr::WindowFunction(WindowFunction { fun, args, @@ -2247,21 +2277,6 @@ fn fmt_function( write!(f, "{}({}{})", fun, distinct_str, args.join(", ")) } -fn write_function_name( - w: &mut W, - fun: &str, - distinct: bool, - args: &[Expr], -) -> Result<()> { - write!(w, "{}(", fun)?; - if distinct { - w.write_str("DISTINCT ")?; - } - write_names_join(w, args, ",")?; - w.write_str(")")?; - Ok(()) -} - /// Returns a readable name of an expression based on the input schema. /// This function recursively transverses the expression for names such as "CAST(a > 2)". pub(crate) fn create_name(e: &Expr) -> Result { @@ -2291,6 +2306,7 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::IsNull(_) | Expr::IsNotNull(_) | Expr::InSubquery(_) + | Expr::Like(_) | Expr::Negative(_) | Expr::Not(_) | Expr::OuterReferenceColumn(..) @@ -2298,52 +2314,12 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { | Expr::Unnest(_) | Expr::ScalarSubquery(..) | Expr::ScalarVariable(..) + | Expr::SimilarTo(_) | Expr::Sort(..) | Expr::Wildcard { .. } | Expr::WindowFunction(_) => write!(w, "{e}")?, Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, - Expr::Like(Like { - negated, - expr, - pattern, - escape_char, - case_insensitive, - }) => { - write!( - w, - "{} {}{} {}", - expr, - if *negated { "NOT " } else { "" }, - if *case_insensitive { "ILIKE" } else { "LIKE" }, - pattern, - )?; - if let Some(char) = escape_char { - write!(w, " CHAR '{char}'")?; - } - } - Expr::SimilarTo(Like { - negated, - expr, - pattern, - escape_char, - case_insensitive: _, - }) => { - write!( - w, - "{} {} {}", - expr, - if *negated { - "NOT SIMILAR TO" - } else { - "SIMILAR TO" - }, - pattern, - )?; - if let Some(char) = escape_char { - write!(w, " CHAR '{char}'")?; - } - } Expr::Cast(Cast { expr, .. }) => { // CAST does not change the expression name write_name(w, expr)?; @@ -2354,47 +2330,11 @@ fn write_name(w: &mut W, e: &Expr) -> Result<()> { } Expr::ScalarFunction(fun) => { w.write_str(fun.func.display_name(&fun.args)?.as_str())?; - } // Expr::WindowFunction(WindowFunction { - // fun, - // args, - // window_frame, - // partition_by, - // order_by, - // null_treatment, - // }) => { - // write_function_name(w, &fun.to_string(), false, args)?; - - // if let Some(nt) = null_treatment { - // w.write_str(" ")?; - // write!(w, "{}", nt)?; - // } - // if !partition_by.is_empty() { - // w.write_str(" ")?; - // write!(w, "PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; - // } - // if !order_by.is_empty() { - // w.write_str(" ")?; - // write!(w, "ORDER BY [{}]", expr_vec_fmt!(order_by))?; - // } - // w.write_str(" ")?; - // write!(w, "{window_frame}")?; - // } + } }; Ok(()) } -fn write_names_join(w: &mut W, exprs: &[Expr], sep: &str) -> Result<()> { - let mut iter = exprs.iter(); - if let Some(first_arg) = iter.next() { - write_name(w, first_arg)?; - } - for a in iter { - w.write_str(sep)?; - write_name(w, a)?; - } - Ok(()) -} - #[cfg(test)] mod test { use crate::expr_fn::col; diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 57a5bcd2b791..ea0bdec124e0 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1148,8 +1148,7 @@ mod test { let like_expr = Expr::Like(Like::new(false, expr, pattern, None, false)); let empty = empty_with_type(DataType::Utf8); let plan = LogicalPlan::Projection(Projection::try_new(vec![like_expr], empty)?); - let expected = "Projection: a LIKE CAST(NULL AS Utf8) AS a LIKE NULL\ - \n EmptyRelation"; + let expected = "Projection: a LIKE CAST(NULL AS Utf8)\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; let expr = Box::new(col("a")); @@ -1177,8 +1176,7 @@ mod test { let ilike_expr = Expr::Like(Like::new(false, expr, pattern, None, true)); let empty = empty_with_type(DataType::Utf8); let plan = LogicalPlan::Projection(Projection::try_new(vec![ilike_expr], empty)?); - let expected = "Projection: a ILIKE CAST(NULL AS Utf8) AS a ILIKE NULL\ - \n EmptyRelation"; + let expected = "Projection: a ILIKE CAST(NULL AS Utf8)\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; let expr = Box::new(col("a")); diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 8fbcd721166e..4beef59d71ce 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -358,8 +358,8 @@ mod tests { let plan = from_substrait_plan(&ctx, &proto).await?; let plan_str = format!("{:?}", plan); - assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ - \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ + assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE Utf8(\"PROMO%\") THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ + \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE Utf8(\"PROMO%\") THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ \n Projection: CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount) ELSE Decimal128(Some(0),19,0) END, FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount)\ \n Filter: FILENAME_PLACEHOLDER_0.l_partkey = FILENAME_PLACEHOLDER_1.p_partkey AND FILENAME_PLACEHOLDER_0.l_shipdate >= Date32(\"1995-09-01\") AND FILENAME_PLACEHOLDER_0.l_shipdate < CAST(Utf8(\"1995-10-01\") AS Date32)\ \n Inner Join: Filter: Boolean(true)\ From 8adde1973b38334e873d09a12062578c0998576c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 17:59:20 +0800 Subject: [PATCH 22/32] rm display name Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 104 ++++++---------------- datafusion/expr/src/udf.rs | 3 +- datafusion/functions/src/core/getfield.rs | 2 +- 3 files changed, 30 insertions(+), 79 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 25106d266ea2..ff16af5b955d 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -998,22 +998,24 @@ impl PartialOrd for Expr { } impl Expr { - /// Returns the name of this expression as it should appear in a schema. This name - /// will not include any CAST expressions. - // TODO: Deprecate it and use Display - pub fn display_name(&self) -> Result { - create_name(self) - } - - /// Returns the name for schema / field that is different from display - /// Most of the expressions are the same as display_name + /// Returns the name for schema / field that is different from Display + /// Most of the expressions are the same as Display /// Those are the main difference - /// 1. Alias - /// 2. Cast + /// 1. Alias, where excludes expression + /// 2. Cast / TryCast, where takes expression only pub fn schema_name(&self) -> Result { let mut s = String::new(); match self { + // The same as Display + Expr::Column(_) + | Expr::Literal(_) + | Expr::ScalarVariable(..) + | Expr::Sort(_) + | Expr::OuterReferenceColumn(..) + | Expr::Placeholder(_) + | Expr::Wildcard { .. } => write!(&mut s, "{self}")?, + Expr::AggregateFunction(AggregateFunction { func_def, args, @@ -1116,10 +1118,9 @@ impl Expr { write!(&mut s, "END")?; } // cast expr is not shown to be consistant with Postgres and Spark - Expr::Cast(Cast { expr, data_type: _ }) => { + Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) => { write!(&mut s, "{}", expr.schema_name()?)? } - Expr::Column(c) => write!(&mut s, "{c}")?, Expr::InList(InList { expr, list, @@ -1189,7 +1190,13 @@ impl Expr { Expr::IsNotFalse(expr) => { write!(&mut s, "{} IS NOT FALSE", expr.schema_name()?)? } - Expr::Like(Like { negated, expr, pattern, escape_char, case_insensitive }) => { + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { write!( &mut s, "{} {}{} {}", @@ -1214,7 +1221,13 @@ impl Expr { Expr::ScalarSubquery(Subquery { subquery, .. }) => { write!(&mut s, "{}", subquery.schema().field(0).name())?; } - Expr::SimilarTo(Like { negated, expr, pattern, escape_char, case_insensitive }) => { + Expr::SimilarTo(Like { + negated, + expr, + pattern, + escape_char, + .. + }) => { write!( &mut s, "{} {} {}", @@ -1269,8 +1282,6 @@ impl Expr { write!(&mut s, " {window_frame}")?; } - // other exprs has no difference - _ => write!(&mut s, "{}", self.display_name()?)?, } Ok(s) @@ -2277,64 +2288,6 @@ fn fmt_function( write!(f, "{}({}{})", fun, distinct_str, args.join(", ")) } -/// Returns a readable name of an expression based on the input schema. -/// This function recursively transverses the expression for names such as "CAST(a > 2)". -pub(crate) fn create_name(e: &Expr) -> Result { - let mut s = String::new(); - write_name(&mut s, e)?; - Ok(s) -} - -fn write_name(w: &mut W, e: &Expr) -> Result<()> { - match e { - // Reuse Display trait - Expr::AggregateFunction(_) - | Expr::Between(_) - | Expr::BinaryExpr(_) - | Expr::Case(_) - | Expr::Column(_) - | Expr::Exists(_) - | Expr::GroupingSet(_) - | Expr::Literal(_) - | Expr::InList(_) - | Expr::IsFalse(_) - | Expr::IsNotFalse(_) - | Expr::IsTrue(_) - | Expr::IsNotTrue(_) - | Expr::IsUnknown(_) - | Expr::IsNotUnknown(_) - | Expr::IsNull(_) - | Expr::IsNotNull(_) - | Expr::InSubquery(_) - | Expr::Like(_) - | Expr::Negative(_) - | Expr::Not(_) - | Expr::OuterReferenceColumn(..) - | Expr::Placeholder(_) - | Expr::Unnest(_) - | Expr::ScalarSubquery(..) - | Expr::ScalarVariable(..) - | Expr::SimilarTo(_) - | Expr::Sort(..) - | Expr::Wildcard { .. } - | Expr::WindowFunction(_) => write!(w, "{e}")?, - - Expr::Alias(Alias { name, .. }) => write!(w, "{name}")?, - Expr::Cast(Cast { expr, .. }) => { - // CAST does not change the expression name - write_name(w, expr)?; - } - Expr::TryCast(TryCast { expr, .. }) => { - // CAST does not change the expression name - write_name(w, expr)?; - } - Expr::ScalarFunction(fun) => { - w.write_str(fun.func.display_name(&fun.args)?.as_str())?; - } - }; - Ok(()) -} - #[cfg(test)] mod test { use crate::expr_fn::col; @@ -2350,7 +2303,6 @@ mod test { let expected = "CASE a WHEN Int32(1) THEN Boolean(true) WHEN Int32(0) THEN Boolean(false) ELSE NULL END"; assert_eq!(expected, expr.canonical_name()); assert_eq!(expected, format!("{expr}")); - assert_eq!(expected, expr.display_name()?); Ok(()) } diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 8cdb9a9dfcb6..4dc58f3b779e 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -26,7 +26,6 @@ use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result}; -use crate::expr::create_name; use crate::interval_arithmetic::Interval; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; @@ -353,7 +352,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// Returns the user-defined display name of the UDF given the arguments fn display_name(&self, args: &[Expr]) -> Result { - let names: Vec = args.iter().map(create_name).collect::>()?; + let names: Vec = args.iter().map(ToString::to_string).collect(); // TODO: join with ", " to standardize the formatting of Vec, Ok(format!("{}({})", self.name(), names.join(","))) } diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 66caa092f103..a68652f98834 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -74,7 +74,7 @@ impl ScalarUDFImpl for GetFieldFunc { } }; - Ok(format!("{}[{}]", args[0].display_name()?, name)) + Ok(format!("{}[{}]", args[0], name)) } fn schema_name(&self, args: &[Expr]) -> Result { From 820b3a5b4a4054369e72454ead7e92da3f646072 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 18:04:30 +0800 Subject: [PATCH 23/32] comment Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ff16af5b955d..c6ed022c1922 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -2124,8 +2124,8 @@ impl fmt::Display for Expr { Expr::ScalarFunction(fun) => { fmt_function(f, fun.name(), false, &fun.args, true) } - // TODO: use udf's display_name - // Expr::ScalarFunction(s) => { + // TODO: use udf's display_name, need to fix the seperator issue, + // Expr::ScalarFunction(ScalarFunction { func, args }) => { // write!(f, "{}", func.display_name(args).unwrap()) // } Expr::WindowFunction(WindowFunction { @@ -2262,7 +2262,7 @@ impl fmt::Display for Expr { }, Expr::Placeholder(Placeholder { id, .. }) => write!(f, "{id}"), Expr::Unnest(Unnest { expr }) => { - // TODO: use Display instead of Debug + // TODO: use Display instead of Debug, there is non-unique expression name in projection issue. write!(f, "UNNEST({expr:?})") } } From 89d3da88cccc14b9fe316412e0bafc257708319e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 18:35:09 +0800 Subject: [PATCH 24/32] cliip Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index c6ed022c1922..efc7d647933b 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1150,7 +1150,7 @@ impl Expr { } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { write!(&mut s, "GROUPING SETS (")?; - for (_, exprs) in lists_of_exprs.iter().enumerate() { + for exprs in lists_of_exprs.iter() { let exprs_name = exprs .iter() .map(Self::schema_name) From 5f8075353f08bf4ff52ec4550d4c152339085fff Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 19:25:36 +0800 Subject: [PATCH 25/32] fix doc Signed-off-by: jayzhan211 --- datafusion/functions-nested/src/expr_ext.rs | 4 ++-- datafusion/functions/src/core/expr_ext.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-nested/src/expr_ext.rs b/datafusion/functions-nested/src/expr_ext.rs index 3524d62d0bc4..0647b187f5ab 100644 --- a/datafusion/functions-nested/src/expr_ext.rs +++ b/datafusion/functions-nested/src/expr_ext.rs @@ -38,7 +38,7 @@ use crate::extract::{array_element, array_slice}; /// # use datafusion_functions_nested::expr_ext::IndexAccessor; /// let expr = col("c1") /// .index(lit(3)); -/// assert_eq!(expr.display_name().unwrap(), "c1[Int32(3)]"); +/// assert_eq!(expr.schema_name().unwrap(), "c1[Int32(3)]"); /// ``` pub trait IndexAccessor { fn index(self, key: Expr) -> Expr; @@ -68,7 +68,7 @@ impl IndexAccessor for Expr { /// # use datafusion_functions_nested::expr_ext::SliceAccessor; /// let expr = col("c1") /// .range(lit(2), lit(4)); -/// assert_eq!(expr.display_name().unwrap(), "c1[Int32(2):Int32(4)]"); +/// assert_eq!(expr.schema_name().unwrap(), "c1[Int32(2):Int32(4)]"); /// ``` pub trait SliceAccessor { fn range(self, start: Expr, stop: Expr) -> Expr; diff --git a/datafusion/functions/src/core/expr_ext.rs b/datafusion/functions/src/core/expr_ext.rs index d80df0f334ab..226f873bb0b5 100644 --- a/datafusion/functions/src/core/expr_ext.rs +++ b/datafusion/functions/src/core/expr_ext.rs @@ -41,7 +41,7 @@ use super::expr_fn::get_field; /// # use datafusion_functions::core::expr_ext::FieldAccessor; /// let expr = col("c1") /// .field("my_field"); -/// assert_eq!(expr.display_name().unwrap(), "c1[my_field]"); +/// assert_eq!(expr.schema_name().unwrap(), "c1[my_field]"); /// ``` pub trait FieldAccessor { fn field(self, name: impl Literal) -> Expr; From c346d76c22df9bf5f419a9268a0ba4bb8b5365b6 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 4 Aug 2024 20:27:55 +0800 Subject: [PATCH 26/32] display Signed-off-by: jayzhan211 --- datafusion/substrait/tests/cases/consumer_integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 9532f788d5ad..099e1fe2146c 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -357,7 +357,7 @@ mod tests { .expect("failed to parse json"); let plan = from_substrait_plan(&ctx, &proto).await?; - let plan_str = format!("{:?}", plan); + let plan_str = format!("{}", plan); assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE Utf8(\"PROMO%\") THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE Utf8(\"PROMO%\") THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ \n Projection: CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount) ELSE Decimal128(Some(0),19,0) END, FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount)\ From 293117aab7ee9fb1e2a59ea023b350f40516cbaf Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 5 Aug 2024 08:25:19 +0800 Subject: [PATCH 27/32] fix conflict Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index bcdd2262cfc9..a8d4444e78c2 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1001,7 +1001,7 @@ impl Expr { | Expr::Wildcard { .. } => write!(&mut s, "{self}")?, Expr::AggregateFunction(AggregateFunction { - func_def, + func, args, distinct, filter, @@ -1013,16 +1013,13 @@ impl Expr { .map(Self::schema_name) .collect::>>()?; // TODO: join with ", " to standardize the formatting of Vec, - if *distinct { - write!( - &mut s, - "{}(DISTINCT {})", - func_def.name(), - args_name.join(",") - )?; - } else { - write!(&mut s, "{}({})", func_def.name(), args_name.join(","))?; - } + write!( + &mut s, + "{}({}{})", + func.name(), + if *distinct { "DISTINCT " } else { "" }, + args_name.join(",") + )?; if let Some(null_treatment) = null_treatment { write!(&mut s, " {}", null_treatment)?; From 484e1ad008d64eb97020bf8f58980d1776a5340c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 6 Aug 2024 13:16:16 +0800 Subject: [PATCH 28/32] fix merge conflict Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 259 ++++++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ca0b0440f6fe..27099349d7ac 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -2270,6 +2270,265 @@ fn fmt_function( write!(f, "{}({}{})", fun, distinct_str, args.join(", ")) } +pub fn create_function_physical_name( + fun: &str, + distinct: bool, + args: &[Expr], + order_by: Option<&Vec>, +) -> Result { + let names: Vec = args + .iter() + .map(|e| create_physical_name(e, false)) + .collect::>()?; + + let distinct_str = match distinct { + true => "DISTINCT ", + false => "", + }; + + let phys_name = format!("{}({}{})", fun, distinct_str, names.join(",")); + + Ok(order_by + .map(|order_by| format!("{} ORDER BY [{}]", phys_name, expr_vec_fmt!(order_by))) + .unwrap_or(phys_name)) +} + +pub fn physical_name(e: &Expr) -> Result { + create_physical_name(e, true) +} + +fn create_physical_name(e: &Expr, is_first_expr: bool) -> Result { + match e { + Expr::Unnest(_) => { + internal_err!( + "Expr::Unnest should have been converted to LogicalPlan::Unnest" + ) + } + Expr::Column(c) => { + if is_first_expr { + Ok(c.name.clone()) + } else { + Ok(c.flat_name()) + } + } + Expr::Alias(Alias { name, .. }) => Ok(name.clone()), + Expr::ScalarVariable(_, variable_names) => Ok(variable_names.join(".")), + Expr::Literal(value) => Ok(format!("{value:?}")), + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + let left = create_physical_name(left, false)?; + let right = create_physical_name(right, false)?; + Ok(format!("{left} {op} {right}")) + } + Expr::Case(case) => { + let mut name = "CASE ".to_string(); + if let Some(e) = &case.expr { + let _ = write!(name, "{} ", create_physical_name(e, false)?); + } + for (w, t) in &case.when_then_expr { + let _ = write!( + name, + "WHEN {} THEN {} ", + create_physical_name(w, false)?, + create_physical_name(t, false)? + ); + } + if let Some(e) = &case.else_expr { + let _ = write!(name, "ELSE {} ", create_physical_name(e, false)?); + } + name += "END"; + Ok(name) + } + Expr::Cast(Cast { expr, .. }) => { + // CAST does not change the expression name + create_physical_name(expr, false) + } + Expr::TryCast(TryCast { expr, .. }) => { + // CAST does not change the expression name + create_physical_name(expr, false) + } + Expr::Not(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("NOT {expr}")) + } + Expr::Negative(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("(- {expr})")) + } + Expr::IsNull(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS NULL")) + } + Expr::IsNotNull(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS NOT NULL")) + } + Expr::IsTrue(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS TRUE")) + } + Expr::IsFalse(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS FALSE")) + } + Expr::IsUnknown(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS UNKNOWN")) + } + Expr::IsNotTrue(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS NOT TRUE")) + } + Expr::IsNotFalse(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS NOT FALSE")) + } + Expr::IsNotUnknown(expr) => { + let expr = create_physical_name(expr, false)?; + Ok(format!("{expr} IS NOT UNKNOWN")) + } + Expr::ScalarFunction(fun) => fun.func.schema_name(&fun.args), + Expr::WindowFunction(WindowFunction { + fun, + args, + order_by, + .. + }) => { + create_function_physical_name(&fun.to_string(), false, args, Some(order_by)) + } + Expr::AggregateFunction(AggregateFunction { + func, + distinct, + args, + filter: _, + order_by, + null_treatment: _, + }) => { + create_function_physical_name(func.name(), *distinct, args, order_by.as_ref()) + } + Expr::GroupingSet(grouping_set) => match grouping_set { + GroupingSet::Rollup(exprs) => Ok(format!( + "ROLLUP ({})", + exprs + .iter() + .map(|e| create_physical_name(e, false)) + .collect::>>()? + .join(", ") + )), + GroupingSet::Cube(exprs) => Ok(format!( + "CUBE ({})", + exprs + .iter() + .map(|e| create_physical_name(e, false)) + .collect::>>()? + .join(", ") + )), + GroupingSet::GroupingSets(lists_of_exprs) => { + let mut strings = vec![]; + for exprs in lists_of_exprs { + let exprs_str = exprs + .iter() + .map(|e| create_physical_name(e, false)) + .collect::>>()? + .join(", "); + strings.push(format!("({exprs_str})")); + } + Ok(format!("GROUPING SETS ({})", strings.join(", "))) + } + }, + + Expr::InList(InList { + expr, + list, + negated, + }) => { + let expr = create_physical_name(expr, false)?; + let list = list.iter().map(|expr| create_physical_name(expr, false)); + if *negated { + Ok(format!("{expr} NOT IN ({list:?})")) + } else { + Ok(format!("{expr} IN ({list:?})")) + } + } + Expr::Exists { .. } => { + not_impl_err!("EXISTS is not yet supported in the physical plan") + } + Expr::InSubquery(_) => { + not_impl_err!("IN subquery is not yet supported in the physical plan") + } + Expr::ScalarSubquery(_) => { + not_impl_err!("Scalar subqueries are not yet supported in the physical plan") + } + Expr::Between(Between { + expr, + negated, + low, + high, + }) => { + let expr = create_physical_name(expr, false)?; + let low = create_physical_name(low, false)?; + let high = create_physical_name(high, false)?; + if *negated { + Ok(format!("{expr} NOT BETWEEN {low} AND {high}")) + } else { + Ok(format!("{expr} BETWEEN {low} AND {high}")) + } + } + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + let expr = create_physical_name(expr, false)?; + let pattern = create_physical_name(pattern, false)?; + let op_name = if *case_insensitive { "ILIKE" } else { "LIKE" }; + let escape = if let Some(char) = escape_char { + format!("CHAR '{char}'") + } else { + "".to_string() + }; + if *negated { + Ok(format!("{expr} NOT {op_name} {pattern}{escape}")) + } else { + Ok(format!("{expr} {op_name} {pattern}{escape}")) + } + } + Expr::SimilarTo(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive: _, + }) => { + let expr = create_physical_name(expr, false)?; + let pattern = create_physical_name(pattern, false)?; + let escape = if let Some(char) = escape_char { + format!("CHAR '{char}'") + } else { + "".to_string() + }; + if *negated { + Ok(format!("{expr} NOT SIMILAR TO {pattern}{escape}")) + } else { + Ok(format!("{expr} SIMILAR TO {pattern}{escape}")) + } + } + Expr::Sort { .. } => { + internal_err!("Create physical name does not support sort expression") + } + Expr::Wildcard { .. } => { + internal_err!("Create physical name does not support wildcard") + } + Expr::Placeholder(_) => { + internal_err!("Create physical name does not support placeholder") + } + Expr::OuterReferenceColumn(_, _) => { + internal_err!("Create physical name does not support OuterReferenceColumn") + } + } +} + #[cfg(test)] mod test { use crate::expr_fn::col; From 2b69fe905d683431cdee62b98e0edcac087263d5 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 7 Aug 2024 16:45:14 +0800 Subject: [PATCH 29/32] with display Signed-off-by: jayzhan211 --- datafusion/core/src/physical_planner.rs | 4 +- datafusion/expr/src/expr.rs | 581 +++++++++--------- datafusion/expr/src/expr_rewriter/mod.rs | 2 +- datafusion/expr/src/expr_rewriter/order_by.rs | 2 +- datafusion/expr/src/expr_schema.rs | 2 +- datafusion/expr/src/logical_plan/builder.rs | 10 +- datafusion/expr/src/logical_plan/plan.rs | 4 +- datafusion/expr/src/udf.rs | 4 +- datafusion/expr/src/utils.rs | 2 +- datafusion/functions-nested/src/extract.rs | 8 +- datafusion/functions/src/core/getfield.rs | 2 +- .../optimizer/src/common_subexpr_eliminate.rs | 2 +- datafusion/optimizer/src/decorrelate.rs | 7 +- .../optimizer/src/optimize_projections/mod.rs | 8 +- datafusion/optimizer/src/push_down_filter.rs | 4 +- .../optimizer/src/scalar_subquery_to_join.rs | 4 +- .../src/single_distinct_to_groupby.rs | 2 +- datafusion/sql/src/unparser/utils.rs | 2 +- datafusion/sql/src/utils.rs | 4 +- datafusion/sqllogictest/test_files/array.slt | 4 +- .../substrait/src/logical_plan/consumer.rs | 2 +- 21 files changed, 340 insertions(+), 320 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index adb1f639675f..89487c4e7a79 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1529,7 +1529,7 @@ pub fn create_window_expr( // unpack aliased logical expressions, e.g. "sum(col) over () as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (name.clone(), expr.as_ref()), - _ => (e.schema_name()?, e), + _ => (e.schema_name().to_string(), e), }; create_window_expr_with_name(e, name, logical_schema, execution_props) } @@ -1620,7 +1620,7 @@ pub fn create_aggregate_expr_and_maybe_filter( // unpack (nested) aliased logical expressions, e.g. "sum(col) as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (Some(name.clone()), expr.as_ref()), - Expr::AggregateFunction(_) => (e.schema_name().ok(), e), + Expr::AggregateFunction(_) => (Some(e.schema_name().to_string()), e), _ => (None, e), }; diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 27099349d7ac..f58b68be4ec0 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -988,285 +988,8 @@ impl Expr { /// Those are the main difference /// 1. Alias, where excludes expression /// 2. Cast / TryCast, where takes expression only - pub fn schema_name(&self) -> Result { - let mut s = String::new(); - - match self { - // The same as Display - Expr::Column(_) - | Expr::Literal(_) - | Expr::ScalarVariable(..) - | Expr::Sort(_) - | Expr::OuterReferenceColumn(..) - | Expr::Placeholder(_) - | Expr::Wildcard { .. } => write!(&mut s, "{self}")?, - - Expr::AggregateFunction(AggregateFunction { - func, - args, - distinct, - filter, - order_by, - null_treatment, - }) => { - let args_name = args - .iter() - .map(Self::schema_name) - .collect::>>()?; - // TODO: join with ", " to standardize the formatting of Vec, - write!( - &mut s, - "{}({}{})", - func.name(), - if *distinct { "DISTINCT " } else { "" }, - args_name.join(",") - )?; - - if let Some(null_treatment) = null_treatment { - write!(&mut s, " {}", null_treatment)?; - } - - if let Some(filter) = filter { - write!(&mut s, " FILTER (WHERE {})", filter.schema_name()?)?; - }; - - if let Some(order_by) = order_by { - let order_by_name = order_by - .iter() - .map(Self::schema_name) - .collect::>>()? - .join(", "); - write!(&mut s, " ORDER BY [{}]", order_by_name)?; - }; - } - // expr is not shown since it is aliased - Expr::Alias(Alias { name, .. }) => write!(&mut s, "{name}")?, - Expr::Between(Between { - expr, - negated, - low, - high, - }) => { - if *negated { - write!( - &mut s, - "{} NOT BETWEEN {} AND {}", - expr.schema_name()?, - low.schema_name()?, - high.schema_name()? - )?; - } else { - write!( - &mut s, - "{} BETWEEN {} AND {}", - expr.schema_name()?, - low.schema_name()?, - high.schema_name()? - )?; - } - } - Expr::BinaryExpr(BinaryExpr { left, op, right }) => { - write!( - &mut s, - "{} {op} {}", - left.schema_name()?, - right.schema_name()? - )?; - } - Expr::Case(Case { - expr, - when_then_expr, - else_expr, - }) => { - write!(&mut s, "CASE ")?; - - if let Some(e) = expr { - write!(&mut s, "{} ", e.schema_name()?)?; - } - - for (when, then) in when_then_expr { - write!( - &mut s, - "WHEN {} THEN {} ", - when.schema_name()?, - then.schema_name()? - )?; - } - - if let Some(e) = else_expr { - write!(&mut s, "ELSE {} ", e.schema_name()?)?; - } - - write!(&mut s, "END")?; - } - // cast expr is not shown to be consistant with Postgres and Spark - Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) => { - write!(&mut s, "{}", expr.schema_name()?)? - } - Expr::InList(InList { - expr, - list, - negated, - }) => { - let inlist_exprs = list - .iter() - .map(Self::schema_name) - .collect::>>()?; - let inlist_name = inlist_exprs.join(", "); - - if *negated { - write!(&mut s, "{} NOT IN {}", expr.schema_name()?, inlist_name)?; - } else { - write!(&mut s, "{} IN {}", expr.schema_name()?, inlist_name)?; - } - } - Expr::Exists(Exists { negated: true, .. }) => write!(&mut s, "NOT EXISTS")?, - Expr::Exists(Exists { negated: false, .. }) => write!(&mut s, "EXISTS")?, - Expr::GroupingSet(GroupingSet::Cube(exprs)) => { - let exprs_name = exprs - .iter() - .map(Self::schema_name) - .collect::>>()? - .join(", "); - write!(&mut s, "ROLLUP ({})", exprs_name)?; - } - Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { - write!(&mut s, "GROUPING SETS (")?; - for exprs in lists_of_exprs.iter() { - let exprs_name = exprs - .iter() - .map(Self::schema_name) - .collect::>>()? - .join(", "); - write!(&mut s, "({})", exprs_name)?; - } - write!(&mut s, ")")?; - } - Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { - let exprs_name = exprs - .iter() - .map(Self::schema_name) - .collect::>>()? - .join(", "); - write!(&mut s, "ROLLUP ({})", exprs_name)?; - } - Expr::IsNull(expr) => write!(&mut s, "{} IS NULL", expr.schema_name()?)?, - Expr::IsNotNull(expr) => { - write!(&mut s, "{} IS NOT NULL", expr.schema_name()?)? - } - Expr::IsUnknown(expr) => { - write!(&mut s, "{} IS UNKNOWN", expr.schema_name()?)? - } - Expr::IsNotUnknown(expr) => { - write!(&mut s, "{} IS NOT UNKNOWN", expr.schema_name()?)? - } - Expr::InSubquery(InSubquery { negated: true, .. }) => { - write!(&mut s, "NOT IN")? - } - Expr::InSubquery(InSubquery { negated: false, .. }) => write!(&mut s, "IN")?, - Expr::IsTrue(expr) => write!(&mut s, "{} IS TRUE", expr.schema_name()?)?, - Expr::IsFalse(expr) => write!(&mut s, "{} IS FALSE", expr.schema_name()?)?, - Expr::IsNotTrue(expr) => { - write!(&mut s, "{} IS NOT TRUE", expr.schema_name()?)? - } - Expr::IsNotFalse(expr) => { - write!(&mut s, "{} IS NOT FALSE", expr.schema_name()?)? - } - Expr::Like(Like { - negated, - expr, - pattern, - escape_char, - case_insensitive, - }) => { - write!( - &mut s, - "{} {}{} {}", - expr.schema_name()?, - if *negated { "NOT " } else { "" }, - if *case_insensitive { "ILIKE" } else { "LIKE" }, - pattern.schema_name()?, - )?; - - if let Some(char) = escape_char { - write!(&mut s, " CHAR '{char}'")?; - } - } - Expr::Negative(expr) => write!(&mut s, "(- {})", expr.schema_name()?)?, - Expr::Not(expr) => write!(&mut s, "NOT {}", expr.schema_name()?)?, - Expr::Unnest(Unnest { expr }) => { - write!(&mut s, "UNNEST({})", expr.schema_name()?)? - } - Expr::ScalarFunction(ScalarFunction { func, args }) => { - write!(&mut s, "{}", func.schema_name(args)?)? - } - Expr::ScalarSubquery(Subquery { subquery, .. }) => { - write!(&mut s, "{}", subquery.schema().field(0).name())?; - } - Expr::SimilarTo(Like { - negated, - expr, - pattern, - escape_char, - .. - }) => { - write!( - &mut s, - "{} {} {}", - expr.schema_name()?, - if *negated { - "NOT SIMILAR TO" - } else { - "SIMILAR TO" - }, - pattern.schema_name()?, - )?; - if let Some(char) = escape_char { - write!(&mut s, " CHAR '{char}'")?; - } - } - Expr::WindowFunction(WindowFunction { - fun, - args, - partition_by, - order_by, - window_frame, - null_treatment, - }) => { - let args_name = args - .iter() - .map(Self::schema_name) - .collect::>>()?; - // TODO: join with ", " to standardize the formatting of Vec, - write!(&mut s, "{}({})", fun, args_name.join(","))?; - - if let Some(null_treatment) = null_treatment { - write!(&mut s, " {}", null_treatment)?; - } - - if !partition_by.is_empty() { - let partition_by_name = partition_by - .iter() - .map(Self::schema_name) - .collect::>>()? - .join(", "); - write!(&mut s, " PARTITION BY [{}]", partition_by_name)?; - } - - if !order_by.is_empty() { - let order_by_name = order_by - .iter() - .map(Self::schema_name) - .collect::>>()? - .join(", "); - write!(&mut s, " ORDER BY [{}]", order_by_name)?; - }; - - write!(&mut s, " {window_frame}")?; - } - } - - Ok(s) + pub fn schema_name<'a>(&'a self) -> impl Display + 'a { + SchemaDisplay(self) } /// Returns a full and complete string representation of this expression. @@ -1399,7 +1122,7 @@ impl Expr { match self { // call Expr::display_name() on a Expr::Sort will throw an error Expr::Sort(Sort { expr, .. }) => expr.name_for_alias(), - expr => expr.schema_name(), + expr => Ok(expr.schema_name().to_string()), } } @@ -2028,6 +1751,302 @@ macro_rules! expr_vec_fmt { }}; } +struct SchemaDisplay<'a>(&'a Expr); +impl<'a> Display for SchemaDisplay<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self.0 { + // The same as Display + Expr::Column(_) + | Expr::Literal(_) + | Expr::ScalarVariable(..) + | Expr::Sort(_) + | Expr::OuterReferenceColumn(..) + | Expr::Placeholder(_) + | Expr::Wildcard { .. } => write!(f, "{}", self.0), + + Expr::AggregateFunction(AggregateFunction { + func, + args, + distinct, + filter, + order_by, + null_treatment, + }) => { + // let args_name = args.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>(); + let args_name = args.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>(); + + write!( + f, + "{}({}{})", + func.name(), + if *distinct { "DISTINCT " } else { "" }, + args_name.join(",") + )?; + + if let Some(null_treatment) = null_treatment { + write!(f, " {}", null_treatment)?; + } + + if let Some(filter) = filter { + write!(f, " FILTER (WHERE {filter})")?; + }; + + if let Some(order_by) = order_by { + let order_by_name = order_by + .iter() + .map(|e| format!("{e}")) + .collect::>() + .join(", "); + write!(f, " ORDER BY [{}]", order_by_name)?; + }; + + Ok(()) + } + // expr is not shown since it is aliased + Expr::Alias(Alias { name, .. }) => write!(f, "{name}"), + Expr::Between(Between { + expr, + negated, + low, + high, + }) => { + if *negated { + write!( + f, + "{} NOT BETWEEN {} AND {}", + SchemaDisplay(expr), + SchemaDisplay(low), + SchemaDisplay(high), + ) + } else { + write!( + f, + "{} BETWEEN {} AND {}", + SchemaDisplay(expr), + SchemaDisplay(low), + SchemaDisplay(high), + ) + } + } + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + write!( + f, + "{} {op} {}", + SchemaDisplay(left), + SchemaDisplay(right), + ) + } + Expr::Case(Case { + expr, + when_then_expr, + else_expr, + }) => { + write!(f, "CASE ")?; + + if let Some(e) = expr { + write!(f, "{} ", SchemaDisplay(e))?; + } + + for (when, then) in when_then_expr { + write!( + f, + "WHEN {} THEN {} ", + SchemaDisplay(when), + SchemaDisplay(then), + )?; + } + + if let Some(e) = else_expr { + write!(f, "ELSE {} ", SchemaDisplay(e))?; + } + + write!(f, "END") + } + // cast expr is not shown to be consistant with Postgres and Spark + Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) => { + write!(f, "{}", SchemaDisplay(expr)) + } + Expr::InList(InList { + expr, + list, + negated, + }) => { + let inlist_exprs = list + .iter() + .map(|e| format!("{}", SchemaDisplay(e))) + .collect::>(); + let inlist_name = inlist_exprs.join(", "); + + if *negated { + write!(f, "{} NOT IN {}", SchemaDisplay(expr), inlist_name) + } else { + write!(f, "{} IN {}", SchemaDisplay(expr), inlist_name) + } + } + Expr::Exists(Exists { negated: true, .. }) => write!(f, "NOT EXISTS"), + Expr::Exists(Exists { negated: false, .. }) => write!(f, "EXISTS"), + Expr::GroupingSet(GroupingSet::Cube(exprs)) => { + let exprs_name = exprs + .iter() + .map(|e| format!("{}", SchemaDisplay(e))) + .collect::>() + .join(", "); + write!(f, "ROLLUP ({})", exprs_name) + } + Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { + write!(f, "GROUPING SETS (")?; + for exprs in lists_of_exprs.iter() { + let exprs_name = exprs + .iter() + .map(|e| format!("{}", SchemaDisplay(e))) + .collect::>() + .join(", "); + write!(f, "({})", exprs_name)?; + } + write!(f, ")") + } + Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { + let exprs_name = exprs + .iter() + .map(|e| format!("{}", SchemaDisplay(e))) + .collect::>() + .join(", "); + write!(f, "ROLLUP ({})", exprs_name) + } + Expr::IsNull(expr) => write!(f, "{} IS NULL", SchemaDisplay(expr)), + Expr::IsNotNull(expr) => { + write!(f, "{} IS NOT NULL", SchemaDisplay(expr)) + } + Expr::IsUnknown(expr) => { + write!(f, "{} IS UNKNOWN", SchemaDisplay(expr)) + } + Expr::IsNotUnknown(expr) => { + write!(f, "{} IS NOT UNKNOWN", SchemaDisplay(expr)) + } + Expr::InSubquery(InSubquery { negated: true, .. }) => { + write!(f, "NOT IN") + } + Expr::InSubquery(InSubquery { negated: false, .. }) => write!(f, "IN"), + Expr::IsTrue(expr) => write!(f, "{} IS TRUE", SchemaDisplay(expr)), + Expr::IsFalse(expr) => write!(f, "{} IS FALSE", SchemaDisplay(expr)), + Expr::IsNotTrue(expr) => { + write!(f, "{} IS NOT TRUE", SchemaDisplay(expr)) + } + Expr::IsNotFalse(expr) => { + write!(f, "{} IS NOT FALSE", SchemaDisplay(expr)) + } + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + write!( + f, + "{} {}{} {}", + SchemaDisplay(expr), + if *negated { "NOT " } else { "" }, + if *case_insensitive { "ILIKE" } else { "LIKE" }, + SchemaDisplay(pattern), + )?; + + if let Some(char) = escape_char { + write!(f, " CHAR '{char}'")?; + } + + Ok(()) + } + Expr::Negative(expr) => write!(f, "(- {})", SchemaDisplay(expr)), + Expr::Not(expr) => write!(f, "NOT {}", SchemaDisplay(expr)), + Expr::Unnest(Unnest { expr }) => { + write!(f, "UNNEST({})", SchemaDisplay(expr)) + } + Expr::ScalarFunction(ScalarFunction { func, args }) => { + match func.schema_name(args) { + Ok(name) => { + write!(f, "{name}") + } + Err(e) => { + write!(f, "got error from schema_name {}", e) + } + } + } + Expr::ScalarSubquery(Subquery { subquery, .. }) => { + write!(f, "{}", subquery.schema().field(0).name()) + } + Expr::SimilarTo(Like { + negated, + expr, + pattern, + escape_char, + .. + }) => { + write!( + f, + "{} {} {}", + SchemaDisplay(expr), + if *negated { + "NOT SIMILAR TO" + } else { + "SIMILAR TO" + }, + SchemaDisplay(pattern), + )?; + if let Some(char) = escape_char { + write!(f, " CHAR '{char}'")?; + } + + Ok(()) + } + Expr::WindowFunction(WindowFunction { + fun, + args, + partition_by, + order_by, + window_frame, + null_treatment, + }) => { + let args_name = args.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>(); + + // TODO: join with ", " to standardize the formatting of Vec, + write!(f, "{}({})", fun, args_name.join(","))?; + + if let Some(null_treatment) = null_treatment { + write!(f, " {}", null_treatment)?; + } + + if !partition_by.is_empty() { + let partition_by_name = partition_by.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>().join(", "); + write!(f, " PARTITION BY [{}]", partition_by_name)?; + } + + if !order_by.is_empty() { + let order_by_name = order_by.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>().join(", "); + write!(f, " ORDER BY [{}]", order_by_name)?; + }; + + write!(f, " {window_frame}") + } + _ => todo!(""), + } + } +} + +impl<'a> SchemaDisplay<'a> { + fn fmt_args(f: &mut Formatter<'_>, args: &[Expr]) -> fmt::Result { + for (i, arg) in args.iter().enumerate() { + if i > 0 { + // TODO: Use ", " to standardize the formatting of Vec, + // + write!(f, ",")?; + } + write!(f, "{}", SchemaDisplay(arg))?; + } + Ok(()) + } +} + /// Format expressions for display as part of a logical plan. In many cases, this will produce /// similar output to `Expr.name()` except that column names will be prefixed with '#'. impl fmt::Display for Expr { @@ -2558,7 +2577,7 @@ mod test { assert_eq!(expected_canonical, format!("{expr}")); // note that CAST intentionally has a name that is different from its `Display` // representation. CAST does not change the name of expressions. - assert_eq!("Float32(1.23)", expr.schema_name()?); + assert_eq!("Float32(1.23)", expr.schema_name().to_string()); Ok(()) } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index fe2940314647..e677a333c163 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -173,7 +173,7 @@ pub fn create_col_from_scalar_expr( name, )), _ => { - let scalar_column = scalar_expr.schema_name()?; + let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( Some::(subqry_alias.into()), scalar_column, diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index a351c8521061..bbb855801c3e 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -109,7 +109,7 @@ fn rewrite_in_terms_of_projection( // expr is an actual expr like min(t.c2), but we are looking // for a column with the same "MIN(C2)", so translate there - let name = normalized_expr.schema_name()?; + let name = normalized_expr.schema_name().to_string(); let search_col = Expr::Column(Column { relation: None, diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 0e5eed35168b..1c645086c534 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -473,7 +473,7 @@ impl ExprSchemable for Expr { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( None, - Field::new(self.schema_name()?, data_type, nullable) + Field::new(self.schema_name().to_string(), data_type, nullable) .with_metadata(self.metadata(input_schema)?) .into(), )) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index a227bcac3c9c..4ef346656ff4 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1298,15 +1298,15 @@ fn add_group_by_exprs_from_dependencies( // c1 + 1` produces an output field named `"c1 + 1"` let mut group_by_field_names = group_expr .iter() - .map(|e| e.schema_name()) - .collect::>>()?; + .map(|e| e.schema_name().to_string()) + .collect::>(); if let Some(target_indices) = get_target_functional_dependencies(schema, &group_by_field_names) { for idx in target_indices { let expr = Expr::Column(Column::from(schema.qualified_field(idx))); - let expr_name = expr.schema_name()?; + let expr_name = expr.schema_name().to_string(); if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); group_expr.push(expr); @@ -1323,7 +1323,7 @@ pub(crate) fn validate_unique_names<'a>( let mut unique_names = HashMap::new(); expressions.into_iter().enumerate().try_for_each(|(position, expr)| { - let name = expr.schema_name()?; + let name = expr.schema_name().to_string(); match unique_names.get(&name) { None => { unique_names.insert(name, (position, expr)); @@ -1557,7 +1557,7 @@ pub fn wrap_projection_for_join_if_necessary( if let Some(col) = key.try_as_col() { Ok(col.clone()) } else { - let name = key.schema_name()?; + let name = key.schema_name().to_string(); Ok(Column::from_name(name)) } }) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0f5c9bbc1d78..c5538d8880a7 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2740,8 +2740,8 @@ fn calc_func_dependencies_for_aggregate( if !contains_grouping_set(group_expr) { let group_by_expr_names = group_expr .iter() - .map(|item| item.schema_name()) - .collect::>>()?; + .map(|item| item.schema_name().to_string()) + .collect::>(); let aggregate_func_dependencies = aggregate_functional_dependencies( input.schema(), &group_by_expr_names, diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 4dc58f3b779e..53e7eb213413 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -361,8 +361,8 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { fn schema_name(&self, args: &[Expr]) -> Result { let args_name = args .iter() - .map(Expr::schema_name) - .collect::>>()?; + .map(|e| e.schema_name().to_string()) + .collect::>(); // TODO: join with ", " to standardize the formatting of Vec, Ok(format!("{}({})", self.name(), args_name.join(","))) } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 2a0b73773514..af507499d83e 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -798,7 +798,7 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; Ok(Expr::from(Column::from((qualifier, field)))) } - _ => Ok(Expr::Column(Column::from_name(expr.schema_name()?))), + _ => Ok(Expr::Column(Column::from_name(expr.schema_name().to_string()))), } } diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index 1d95e55aaa44..b9e82f371369 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -108,8 +108,8 @@ impl ScalarUDFImpl for ArrayElement { fn schema_name(&self, args: &[Expr]) -> Result { let args_name = args .iter() - .map(Expr::schema_name) - .collect::>>()?; + .map(|e| e.schema_name().to_string()) + .collect::>(); if args_name.len() != 2 { return exec_err!("expect 2 args, got {}", args_name.len()); } @@ -278,8 +278,8 @@ impl ScalarUDFImpl for ArraySlice { fn schema_name(&self, args: &[Expr]) -> Result { let args_name = args .iter() - .map(Expr::schema_name) - .collect::>>()?; + .map(|e| e.schema_name().to_string()) + .collect::>(); if let Some((arr, indexes)) = args_name.split_first() { Ok(format!("{arr}[{}]", indexes.join(":"))) } else { diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index a68652f98834..a51f895c5084 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -94,7 +94,7 @@ impl ScalarUDFImpl for GetFieldFunc { } }; - Ok(format!("{}[{}]", args[0].schema_name()?, name)) + Ok(format!("{}[{}]", args[0].schema_name(), name)) } fn signature(&self) -> &Signature { diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index cf634624cd7c..45e5409ae9ac 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1108,7 +1108,7 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_, '_> { self.down_index += 1; } - let expr_name = expr.schema_name()?; + let expr_name = expr.schema_name().to_string(); let (_, expr_alias) = self.common_exprs.entry(expr_id).or_insert_with(|| { let expr_alias = self.alias_generator.next(CSE_PREFIX); diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 56091076a702..fd2077921b58 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -452,7 +452,7 @@ fn agg_exprs_evaluation_result_on_empty_batch( let simplifier = ExprSimplifier::new(info); let result_expr = simplifier.simplify(result_expr)?; if matches!(result_expr, Expr::Literal(ScalarValue::Int64(_))) { - expr_result_map_for_count_bug.insert(e.schema_name()?, result_expr); + expr_result_map_for_count_bug.insert(e.schema_name().to_string(), result_expr); } } Ok(()) @@ -490,7 +490,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( let expr_name = match expr { Expr::Alias(Alias { name, .. }) => name.to_string(), Expr::Column(Column { relation: _, name }) => name.to_string(), - _ => expr.schema_name()?, + _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); } @@ -546,8 +546,9 @@ fn filter_exprs_evaluation_result_on_empty_batch( )], else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null))), }); + let expr_key = new_expr.schema_name().to_string(); expr_result_map_for_count_bug - .insert(new_expr.schema_name()?, new_expr); + .insert(expr_key, new_expr); } None } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 099aa161b326..ac4ed87a4a1a 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -135,8 +135,8 @@ fn optimize_projections( let group_by_expr_existing = aggregate .group_expr .iter() - .map(|group_by_expr| group_by_expr.schema_name()) - .collect::>>()?; + .map(|group_by_expr| group_by_expr.schema_name().to_string()) + .collect::>(); let new_group_bys = if let Some(simplest_groupby_indices) = get_required_group_by_exprs_indices( @@ -1928,8 +1928,8 @@ mod tests { WindowFunctionDefinition::AggregateUDF(max_udaf()), vec![col("test.b")], )); - let col1 = col(max1.schema_name()?); - let col2 = col(max2.schema_name()?); + let col1 = col(max1.schema_name().to_string()); + let col2 = col(max2.schema_name().to_string()); let plan = LogicalPlanBuilder::from(table_scan) .window(vec![max1])? diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index ba3b698622ab..8455919c35a8 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -817,7 +817,7 @@ impl OptimizerRule for PushDownFilter { let group_expr_columns = agg .group_expr .iter() - .map(|e| Ok(Column::from_qualified_name(e.schema_name()?))) + .map(|e| Ok(Column::from_qualified_name(e.schema_name().to_string()))) .collect::>>()?; let predicates = split_conjunction_owned(filter.predicate.clone()); @@ -838,7 +838,7 @@ impl OptimizerRule for PushDownFilter { // So we need create a replace_map, add {`a+b` --> Expr(Column(a)+Column(b))} let mut replace_map = HashMap::new(); for expr in &agg.group_expr { - replace_map.insert(expr.schema_name()?, expr.clone()); + replace_map.insert(expr.schema_name().to_string(), expr.clone()); } let replaced_push_predicates = push_predicates .into_iter() diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index cb195ca14e08..c79180b79256 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -188,9 +188,9 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut proj_exprs = vec![]; for expr in projection.expr.iter() { - let old_expr_name = expr.schema_name()?; + let old_expr_name = expr.schema_name().to_string(); let new_expr = expr_to_rewrite_expr_map.get(expr).unwrap(); - let new_expr_name = new_expr.schema_name()?; + let new_expr_name = new_expr.schema_name().to_string(); if new_expr_name != old_expr_name { proj_exprs.push(new_expr.clone().alias(old_expr_name)) } else { diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index ef71ba1530c0..30cae17eaf9f 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -194,7 +194,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { } let arg = args.swap_remove(0); - if group_fields_set.insert(arg.schema_name()?) { + if group_fields_set.insert(arg.schema_name().to_string()) { inner_group_exprs .push(arg.alias(SINGLE_DISTINCT_ALIAS)); } diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index 8d6af529cb0d..c1b3fe18f7e7 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -115,7 +115,7 @@ pub(crate) fn unproject_window_exprs(expr: &Expr, windows: &[&Window]) -> Result if let Some(unproj) = windows .iter() .flat_map(|w| w.window_expr.iter()) - .find(|window_expr| window_expr.schema_name().unwrap() == c.name) + .find(|window_expr| window_expr.schema_name().to_string() == c.name) { Ok(Transformed::yes(unproj.clone())) } else { diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 0356727720cd..5cdc546e0267 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -329,7 +329,7 @@ pub(crate) fn transform_bottom_unnest( // Full context, we are trying to plan the execution as InnerProjection->Unnest->OuterProjection // inside unnest execution, each column inside the inner projection // will be transformed into new columns. Thus we need to keep track of these placeholding column names - let placeholder_name = unnest_expr.schema_name()?; + let placeholder_name = unnest_expr.schema_name().to_string(); unnest_placeholder_columns.push(placeholder_name.clone()); // Add alias for the argument expression, to avoid naming conflicts @@ -402,7 +402,7 @@ pub(crate) fn transform_bottom_unnest( } else { // We need to evaluate the expr in the inner projection, // outer projection just select its name - let column_name = transformed_expr.schema_name()?; + let column_name = transformed_expr.schema_name().to_string(); inner_projection_exprs.push(transformed_expr); Ok(vec![Expr::Column(Column::from_name(column_name))]) } diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 4b04d476dc01..d0f9ecf54204 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -1137,7 +1137,7 @@ from arrays_values_without_nulls; ## array_element (aliases: array_extract, list_extract, list_element) # Testing with empty arguments should result in an error -query error DataFusion error: Execution error: expect 2 args, got 0 +query error DataFusion error: Error during planning: Error during planning: array_element does not support zero arguments select array_element(); # array_element error @@ -1979,7 +1979,7 @@ select array_slice(a, -1, 2, 1), array_slice(a, -1, 2), [6.0] [6.0] [] [] # Testing with empty arguments should result in an error -query error DataFusion error: Execution error: no argument +query error DataFusion error: Error during planning: Error during planning: array_slice does not support zero arguments select array_slice(); diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 10154edfdfd7..f2756bb06d1e 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -417,7 +417,7 @@ pub async fn from_substrait_rel( } // Ensure the expression has a unique display name, so that project's // validate_unique_names doesn't fail - let name = x.schema_name()?; + let name = x.schema_name().to_string(); let mut new_name = name.clone(); let mut i = 0; while names.contains(&new_name) { From 136e900c4285a235dc9e9d6e44bafc3aa0ae6cb8 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 7 Aug 2024 17:39:55 +0800 Subject: [PATCH 30/32] function for exprs Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 133 ++++++++++++----------- datafusion/expr/src/expr_rewriter/mod.rs | 14 +-- datafusion/expr/src/udf.rs | 16 +-- datafusion/expr/src/utils.rs | 4 +- datafusion/optimizer/src/decorrelate.rs | 6 +- 5 files changed, 91 insertions(+), 82 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index f58b68be4ec0..8e1e41d16413 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -983,11 +983,33 @@ impl PartialOrd for Expr { } impl Expr { - /// Returns the name for schema / field that is different from Display - /// Most of the expressions are the same as Display - /// Those are the main difference - /// 1. Alias, where excludes expression - /// 2. Cast / TryCast, where takes expression only + #[deprecated(since = "40.0.0", note = "use schema_name instead")] + pub fn display_name(&self) -> Result { + Ok(self.schema_name().to_string()) + } + + /// The name of the column (field) that this `Expr` will produce. + /// + /// For example, for a projection (e.g. `SELECT `) the resulting arrow + /// [`Schema`] will have a field with this name. + /// + /// Note that the resulting string is subtlety different than the `Display` + /// representation for certain `Expr`. Some differences: + /// + /// 1. [`Expr::Alias`], which shows only the alias itself + /// 2. [`Expr::Cast`] / [`Expr::TryCast`], which only displays the expression + /// + /// # Example + /// ``` + /// # use datafusion_expr::{col, lit}; + /// let expr = col("foo").eq(lit(42)); + /// assert_eq!("foo = Int32(42)", expr.schema_name().to_string()); + /// + /// let expr = col("foo").alias("bar").eq(lit(11)); + /// assert_eq!("bar = Int32(11)", expr.schema_name().to_string()); + /// ``` + /// + /// [`Schema`]: arrow::datatypes::Schema pub fn schema_name<'a>(&'a self) -> impl Display + 'a { SchemaDisplay(self) } @@ -1772,15 +1794,12 @@ impl<'a> Display for SchemaDisplay<'a> { order_by, null_treatment, }) => { - // let args_name = args.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>(); - let args_name = args.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>(); - write!( f, "{}({}{})", func.name(), if *distinct { "DISTINCT " } else { "" }, - args_name.join(",") + schema_name_from_exprs_comma_seperated_without_space(args)? )?; if let Some(null_treatment) = null_treatment { @@ -1792,12 +1811,7 @@ impl<'a> Display for SchemaDisplay<'a> { }; if let Some(order_by) = order_by { - let order_by_name = order_by - .iter() - .map(|e| format!("{e}")) - .collect::>() - .join(", "); - write!(f, " ORDER BY [{}]", order_by_name)?; + write!(f, " ORDER BY [{}]", schema_name_from_exprs(order_by)?)?; }; Ok(()) @@ -1829,12 +1843,7 @@ impl<'a> Display for SchemaDisplay<'a> { } } Expr::BinaryExpr(BinaryExpr { left, op, right }) => { - write!( - f, - "{} {op} {}", - SchemaDisplay(left), - SchemaDisplay(right), - ) + write!(f, "{} {op} {}", SchemaDisplay(left), SchemaDisplay(right),) } Expr::Case(Case { expr, @@ -1871,11 +1880,7 @@ impl<'a> Display for SchemaDisplay<'a> { list, negated, }) => { - let inlist_exprs = list - .iter() - .map(|e| format!("{}", SchemaDisplay(e))) - .collect::>(); - let inlist_name = inlist_exprs.join(", "); + let inlist_name = schema_name_from_exprs(list)?; if *negated { write!(f, "{} NOT IN {}", SchemaDisplay(expr), inlist_name) @@ -1886,32 +1891,17 @@ impl<'a> Display for SchemaDisplay<'a> { Expr::Exists(Exists { negated: true, .. }) => write!(f, "NOT EXISTS"), Expr::Exists(Exists { negated: false, .. }) => write!(f, "EXISTS"), Expr::GroupingSet(GroupingSet::Cube(exprs)) => { - let exprs_name = exprs - .iter() - .map(|e| format!("{}", SchemaDisplay(e))) - .collect::>() - .join(", "); - write!(f, "ROLLUP ({})", exprs_name) + write!(f, "ROLLUP ({})", schema_name_from_exprs(exprs)?) } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { write!(f, "GROUPING SETS (")?; for exprs in lists_of_exprs.iter() { - let exprs_name = exprs - .iter() - .map(|e| format!("{}", SchemaDisplay(e))) - .collect::>() - .join(", "); - write!(f, "({})", exprs_name)?; + write!(f, "({})", schema_name_from_exprs(exprs)?)?; } write!(f, ")") } Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { - let exprs_name = exprs - .iter() - .map(|e| format!("{}", SchemaDisplay(e))) - .collect::>() - .join(", "); - write!(f, "ROLLUP ({})", exprs_name) + write!(f, "ROLLUP ({})", schema_name_from_exprs(exprs)?) } Expr::IsNull(expr) => write!(f, "{} IS NULL", SchemaDisplay(expr)), Expr::IsNotNull(expr) => { @@ -2007,44 +1997,61 @@ impl<'a> Display for SchemaDisplay<'a> { window_frame, null_treatment, }) => { - let args_name = args.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>(); - - // TODO: join with ", " to standardize the formatting of Vec, - write!(f, "{}({})", fun, args_name.join(","))?; + write!( + f, + "{}({})", + fun, + schema_name_from_exprs_comma_seperated_without_space(args)? + )?; if let Some(null_treatment) = null_treatment { write!(f, " {}", null_treatment)?; } if !partition_by.is_empty() { - let partition_by_name = partition_by.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>().join(", "); - write!(f, " PARTITION BY [{}]", partition_by_name)?; + write!( + f, + " PARTITION BY [{}]", + schema_name_from_exprs(partition_by)? + )?; } if !order_by.is_empty() { - let order_by_name = order_by.iter().map(|e| format!("{}", SchemaDisplay(e))).collect::>().join(", "); - write!(f, " ORDER BY [{}]", order_by_name)?; + write!(f, " ORDER BY [{}]", schema_name_from_exprs(order_by)?)?; }; write!(f, " {window_frame}") } - _ => todo!(""), } } } -impl<'a> SchemaDisplay<'a> { - fn fmt_args(f: &mut Formatter<'_>, args: &[Expr]) -> fmt::Result { - for (i, arg) in args.iter().enumerate() { - if i > 0 { - // TODO: Use ", " to standardize the formatting of Vec, - // - write!(f, ",")?; - } - write!(f, "{}", SchemaDisplay(arg))?; +/// Get schema_name for Vector of expressions +/// +/// Internal usage. Please call `schema_name_from_exprs` instead +// TODO: Use ", " to standardize the formatting of Vec, +// +pub(crate) fn schema_name_from_exprs_comma_seperated_without_space( + exprs: &[Expr], +) -> Result { + schema_name_from_exprs_inner(exprs, ",") +} + +/// Get schema_name for Vector of expressions +pub fn schema_name_from_exprs(exprs: &[Expr]) -> Result { + schema_name_from_exprs_inner(exprs, ", ") +} + +fn schema_name_from_exprs_inner(exprs: &[Expr], sep: &str) -> Result { + let mut s = String::new(); + for (i, e) in exprs.iter().enumerate() { + if i > 0 { + write!(&mut s, "{sep}")?; } - Ok(()) + write!(&mut s, "{}", SchemaDisplay(e))?; } + + Ok(s) } /// Format expressions for display as part of a logical plan. In many cases, this will produce diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index e677a333c163..0dc41d4a9ac1 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -475,16 +475,14 @@ mod test { let expr = rewrite_preserving_name(expr_from.clone(), &mut rewriter).unwrap(); let original_name = match &expr_from { - Expr::Sort(Sort { expr, .. }) => expr.schema_name(), - expr => expr.schema_name(), - } - .unwrap(); + Expr::Sort(Sort { expr, .. }) => expr.schema_name().to_string(), + expr => expr.schema_name().to_string(), + }; let new_name = match &expr { - Expr::Sort(Sort { expr, .. }) => expr.schema_name(), - expr => expr.schema_name(), - } - .unwrap(); + Expr::Sort(Sort { expr, .. }) => expr.schema_name().to_string(), + expr => expr.schema_name().to_string(), + }; assert_eq!( original_name, new_name, diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 53e7eb213413..34b5909f0a5a 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -26,6 +26,7 @@ use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result}; +use crate::expr::schema_name_from_exprs_comma_seperated_without_space; use crate::interval_arithmetic::Interval; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; @@ -357,14 +358,15 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { Ok(format!("{}({})", self.name(), names.join(","))) } - /// Returns the user-defined schema name of the UDF given the arguments + /// Returns the name of the column this expression would create + /// + /// See [`Expr::schema_name`] for details fn schema_name(&self, args: &[Expr]) -> Result { - let args_name = args - .iter() - .map(|e| e.schema_name().to_string()) - .collect::>(); - // TODO: join with ", " to standardize the formatting of Vec, - Ok(format!("{}({})", self.name(), args_name.join(","))) + Ok(format!( + "{}({})", + self.name(), + schema_name_from_exprs_comma_seperated_without_space(args)? + )) } /// Returns the function's [`Signature`] for information about what input diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index af507499d83e..c3e4505ed19c 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -798,7 +798,9 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; Ok(Expr::from(Column::from((qualifier, field)))) } - _ => Ok(Expr::Column(Column::from_name(expr.schema_name().to_string()))), + _ => Ok(Expr::Column(Column::from_name( + expr.schema_name().to_string(), + ))), } } diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index fd2077921b58..16b4e43abcd5 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -452,7 +452,8 @@ fn agg_exprs_evaluation_result_on_empty_batch( let simplifier = ExprSimplifier::new(info); let result_expr = simplifier.simplify(result_expr)?; if matches!(result_expr, Expr::Literal(ScalarValue::Int64(_))) { - expr_result_map_for_count_bug.insert(e.schema_name().to_string(), result_expr); + expr_result_map_for_count_bug + .insert(e.schema_name().to_string(), result_expr); } } Ok(()) @@ -547,8 +548,7 @@ fn filter_exprs_evaluation_result_on_empty_batch( else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null))), }); let expr_key = new_expr.schema_name().to_string(); - expr_result_map_for_count_bug - .insert(expr_key, new_expr); + expr_result_map_for_count_bug.insert(expr_key, new_expr); } None } From 58957e4c7b4367b18c49746352ab9161f4f87b51 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 7 Aug 2024 17:50:07 +0800 Subject: [PATCH 31/32] clippy Signed-off-by: jayzhan211 --- datafusion/expr/src/expr.rs | 2 +- datafusion/proto/tests/cases/serialize.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 8e1e41d16413..5030a95d3c8a 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1010,7 +1010,7 @@ impl Expr { /// ``` /// /// [`Schema`]: arrow::datatypes::Schema - pub fn schema_name<'a>(&'a self) -> impl Display + 'a { + pub fn schema_name(&self) -> impl Display + '_ { SchemaDisplay(self) } diff --git a/datafusion/proto/tests/cases/serialize.rs b/datafusion/proto/tests/cases/serialize.rs index 7fda9a2550df..f28098d83b97 100644 --- a/datafusion/proto/tests/cases/serialize.rs +++ b/datafusion/proto/tests/cases/serialize.rs @@ -276,7 +276,7 @@ fn test_expression_serialization_roundtrip() { /// Extracts the first part of a function name /// 'foo(bar)' -> 'foo' fn extract_function_name(expr: &Expr) -> String { - let name = expr.schema_name().unwrap(); + let name = expr.schema_name().to_string(); name.split('(').next().unwrap().to_string() } } From e89e10bd53a69f01eddefa0de068ea8f8d0badd3 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 7 Aug 2024 19:12:39 +0800 Subject: [PATCH 32/32] fix doc Signed-off-by: jayzhan211 --- datafusion/functions-nested/src/expr_ext.rs | 4 ++-- datafusion/functions/src/core/expr_ext.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-nested/src/expr_ext.rs b/datafusion/functions-nested/src/expr_ext.rs index 0647b187f5ab..4da4a3f583b7 100644 --- a/datafusion/functions-nested/src/expr_ext.rs +++ b/datafusion/functions-nested/src/expr_ext.rs @@ -38,7 +38,7 @@ use crate::extract::{array_element, array_slice}; /// # use datafusion_functions_nested::expr_ext::IndexAccessor; /// let expr = col("c1") /// .index(lit(3)); -/// assert_eq!(expr.schema_name().unwrap(), "c1[Int32(3)]"); +/// assert_eq!(expr.schema_name().to_string(), "c1[Int32(3)]"); /// ``` pub trait IndexAccessor { fn index(self, key: Expr) -> Expr; @@ -68,7 +68,7 @@ impl IndexAccessor for Expr { /// # use datafusion_functions_nested::expr_ext::SliceAccessor; /// let expr = col("c1") /// .range(lit(2), lit(4)); -/// assert_eq!(expr.schema_name().unwrap(), "c1[Int32(2):Int32(4)]"); +/// assert_eq!(expr.schema_name().to_string(), "c1[Int32(2):Int32(4)]"); /// ``` pub trait SliceAccessor { fn range(self, start: Expr, stop: Expr) -> Expr; diff --git a/datafusion/functions/src/core/expr_ext.rs b/datafusion/functions/src/core/expr_ext.rs index 226f873bb0b5..af05f447f1c1 100644 --- a/datafusion/functions/src/core/expr_ext.rs +++ b/datafusion/functions/src/core/expr_ext.rs @@ -41,7 +41,7 @@ use super::expr_fn::get_field; /// # use datafusion_functions::core::expr_ext::FieldAccessor; /// let expr = col("c1") /// .field("my_field"); -/// assert_eq!(expr.schema_name().unwrap(), "c1[my_field]"); +/// assert_eq!(expr.schema_name().to_string(), "c1[my_field]"); /// ``` pub trait FieldAccessor { fn field(self, name: impl Literal) -> Expr;