From bfdc8660d5ba06b16dfc586d091ea344479815eb Mon Sep 17 00:00:00 2001 From: Jiashu-Hu Date: Sun, 16 Mar 2025 20:20:42 -0500 Subject: [PATCH 1/4] Improve groupby error message for new version of expand wildcard logic, also modified related CICL content --- datafusion/common/src/error.rs | 19 ++++++++++-- datafusion/sql/src/select.rs | 34 +++++++++++++++++++++ datafusion/sql/src/utils.rs | 54 +++++++++++++++++----------------- 3 files changed, 78 insertions(+), 29 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index c50ec64759d5..5808784405a4 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -164,19 +164,27 @@ macro_rules! context { #[derive(Debug)] pub enum SchemaError { /// Schema contains a (possibly) qualified and unqualified field with same unqualified name - AmbiguousReference { field: Column }, + AmbiguousReference { + field: Column, + }, /// Schema contains duplicate qualified field name DuplicateQualifiedField { qualifier: Box, name: String, }, /// Schema contains duplicate unqualified field name - DuplicateUnqualifiedField { name: String }, + DuplicateUnqualifiedField { + name: String, + }, /// No field with this name FieldNotFound { field: Box, valid_fields: Vec, }, + // Group by column does contains all the columns in the select list + GoupByColumnNotInSelectList { + column: String, + }, } impl Display for SchemaError { @@ -256,6 +264,13 @@ impl Display for SchemaError { ) } } + Self::GoupByColumnNotInSelectList { column } => { + write!( + f, + "While expanding wildcard, column ’{}‘ must appear in the GROUP BY clause or must be part of an aggregate function.", + column + ) + } } } } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index e9cf4ce48a77..968aef75bab9 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -27,6 +27,7 @@ use crate::utils::{ use datafusion_common::error::DataFusionErrorBuilder; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::DataFusionError; use datafusion_common::{not_impl_err, plan_err, Result}; use datafusion_common::{RecursionUnnestOption, UnnestOptions}; use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; @@ -826,6 +827,13 @@ impl SqlToRel<'_, S> { .map(|expr| rebase_expr(expr, &aggr_projection_exprs, input)) .collect::>>()?; + // check if the columns in the SELECT list are in the GROUP BY clause + // or are part of an aggregate function, if not, throw an error. + validate_columns_in_group_by_or_aggregate( + &select_exprs_post_aggr, + &column_exprs_post_aggr, + )?; + // finally, we have some validation that the re-written projection can be resolved // from the aggregate output columns check_columns_satisfy_exprs( @@ -855,6 +863,32 @@ impl SqlToRel<'_, S> { } } +/// This function is for checking if the columns in the SELECT list are +/// in the GROUP BY clause or are part of an aggregate function. +/// If not, throw an SchemaError::GroupByColumnInvalid and return an error. +/// +fn validate_columns_in_group_by_or_aggregate( + expanded: &Vec, + aggregate_columns: &Vec, +) -> Result<(), DataFusionError> { + if !aggregate_columns.is_empty() { + for e in expanded { + if let Expr::Column(col) = e { + let name = &col.name; + if !aggregate_columns.contains(e) { + return Err(DataFusionError::SchemaError( + datafusion_common::SchemaError::GoupByColumnNotInSelectList { + column: (name.clone().to_string()), + }, + Box::new(None), + )); + } + } + } + } + Ok(()) +} + // If there are any multiple-defined windows, we raise an error. fn check_conflicting_windows(window_defs: &[NamedWindowDefinition]) -> Result<()> { for (i, window_def_i) in window_defs.iter().enumerate() { diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 4a248de101dc..a0e54393af8a 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -102,10 +102,10 @@ impl CheckColumnsSatisfyExprsPurpose { fn message_prefix(&self) -> &'static str { match self { CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate => { - "Projection references non-aggregate values" + "Column in SELECT must be in GROUP BY or an aggregate function" } CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate => { - "HAVING clause references non-aggregate values" + "Column in HAVING must be in GROUP BY or an aggregate function" } } } @@ -159,7 +159,7 @@ fn check_column_satisfies_expr( ) -> Result<()> { if !columns.contains(expr) { return plan_err!( - "{}: Expression {} could not be resolved from available columns: {}", + "{}: Expression {} could not be resolved from available columns: {}, it must be in GROUP BY or an aggregate function", purpose.message_prefix(), expr, expr_vec_fmt!(columns) @@ -169,7 +169,7 @@ fn check_column_satisfies_expr( purpose.diagnostic_message(expr), expr.spans().and_then(|spans| spans.first()), ) - .with_help(format!("add '{expr}' to GROUP BY clause"), None); + .with_help(format!("Either add '{expr}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({expr})"), None); err.with_diagnostic(diagnostic) }); } @@ -496,30 +496,30 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> { /// /// For example an expr of **unnest(unnest(column1)) + unnest(unnest(unnest(column2)))** /// ```text - /// ┌──────────────────┐ - /// │ binaryexpr │ - /// │ │ - /// └──────────────────┘ - /// f_down / / │ │ - /// / / f_up │ │ - /// / / f_down│ │f_up - /// unnest │ │ - /// │ │ - /// f_down / / f_up(rewriting) │ │ - /// / / - /// / / unnest - /// unnest - /// f_down / / f_up(rewriting) - /// f_down / /f_up / / - /// / / / / - /// / / unnest - /// column1 - /// f_down / /f_up - /// / / - /// / / - /// column2 + /// ┌──────────────────┐ + /// │ binaryexpr │ + /// │ │ + /// └──────────────────┘ + /// f_down / / │ │ + /// / / f_up │ │ + /// / / f_down│ │f_up + /// unnest │ │ + /// │ │ + /// f_down / / f_up(rewriting) │ │ + /// / / + /// / / unnest + /// unnest + /// f_down / / f_up(rewriting) + /// f_down / /f_up / / + /// / / / / + /// / / unnest + /// column1 + /// f_down / /f_up + /// / / + /// / / + /// column2 /// ``` - /// + /// fn f_up(&mut self, expr: Expr) -> Result> { if let Expr::Unnest(ref traversing_unnest) = expr { if traversing_unnest == self.top_most_unnest.as_ref().unwrap() { From c24c3b5afe08a0b8e6f6c9608bfb3f35d6f6d0bb Mon Sep 17 00:00:00 2001 From: Jiashu-Hu Date: Mon, 17 Mar 2025 21:08:40 -0500 Subject: [PATCH 2/4] Improved error message for new expand wildcard logic, also editted related CICL test --- datafusion/common/src/error.rs | 4 +-- datafusion/sql/src/select.rs | 34 +++++++++++++++---- datafusion/sql/tests/cases/diagnostic.rs | 4 +-- datafusion/sql/tests/sql_integration.rs | 14 ++++---- .../sqllogictest/test_files/group_by.slt | 10 +++--- datafusion/sqllogictest/test_files/unnest.slt | 14 ++++---- 6 files changed, 50 insertions(+), 30 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 5808784405a4..8f42f60cf123 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -182,7 +182,7 @@ pub enum SchemaError { valid_fields: Vec, }, // Group by column does contains all the columns in the select list - GoupByColumnNotInSelectList { + GroupByColumnNotInSelectList { column: String, }, } @@ -264,7 +264,7 @@ impl Display for SchemaError { ) } } - Self::GoupByColumnNotInSelectList { column } => { + Self::GroupByColumnNotInSelectList { column } => { write!( f, "While expanding wildcard, column ’{}‘ must appear in the GROUP BY clause or must be part of an aggregate function.", diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 968aef75bab9..e9ab23b3492c 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -25,10 +25,10 @@ use crate::utils::{ CheckColumnsSatisfyExprsPurpose, }; -use datafusion_common::error::DataFusionErrorBuilder; +use datafusion_common::error::{DataFusionError, DataFusionErrorBuilder}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::DataFusionError; -use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_common::SchemaError::GroupByColumnNotInSelectList; +use datafusion_common::{not_impl_err, plan_err, Diagnostic, Result}; use datafusion_common::{RecursionUnnestOption, UnnestOptions}; use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ @@ -874,14 +874,34 @@ fn validate_columns_in_group_by_or_aggregate( if !aggregate_columns.is_empty() { for e in expanded { if let Expr::Column(col) = e { + let mut table_name = String::new(); + if let Some(relation) = &col.relation { + table_name = relation.to_string(); + } let name = &col.name; if !aggregate_columns.contains(e) { + if table_name.is_empty() { + let mut diagnostic = + Diagnostic::new_error(format!("Column '{}' must appear in the GROUP BY clause or be used in an aggregate function", &name), col.spans().first()); + diagnostic.add_help(format!("Either add '{}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({})", &name, &name), None); + return Err(DataFusionError::SchemaError( + GroupByColumnNotInSelectList { + column: name.clone(), + }, + Box::new(Some(DataFusionError::get_back_trace())), + ) + .with_diagnostic(diagnostic)); + } + let mut diagnostic = + Diagnostic::new_error(format!("Column '{}.{}' must appear in the GROUP BY clause or be used in an aggregate function", &table_name, &name), col.spans().first()); + diagnostic.add_help(format!("Either add '{}.{}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({}.{})", &table_name, &name, &table_name, &name), None); return Err(DataFusionError::SchemaError( - datafusion_common::SchemaError::GoupByColumnNotInSelectList { - column: (name.clone().to_string()), + GroupByColumnNotInSelectList { + column: (format!("{}.{}", &table_name, &name)), }, - Box::new(None), - )); + Box::new(Some(DataFusionError::get_back_trace())), + ) + .with_diagnostic(diagnostic)); } } } diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index 5481f046e70a..4daa78874023 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -185,12 +185,12 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> { let diag = do_query(query); assert_eq!( diag.message, - "'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression" + "Column 'person.first_name' must appear in the GROUP BY clause or be used in an aggregate function" ); assert_eq!(diag.span, Some(spans["a"])); assert_eq!( diag.helps[0].message, - "add 'person.first_name' to GROUP BY clause" + "Either add 'person.first_name' to GROUP BY clause, or use an aggregare function like ANY_VALUE(person.first_name)" ); Ok(()) } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 5717f9943e59..9ca07507413f 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -820,7 +820,7 @@ fn select_with_having_refers_to_invalid_column() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.first_name could not be resolved from available columns: person.id, max(person.age)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: Expression person.first_name could not be resolved from available columns: person.id, max(person.age), it must be in GROUP BY or an aggregate function", err.strip_backtrace() ); } @@ -844,7 +844,7 @@ fn select_with_having_with_aggregate_not_in_select() { HAVING MAX(age) > 100"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.first_name could not be resolved from available columns: max(person.age)", + "Schema error: While expanding wildcard, column ’person.first_name‘ must appear in the GROUP BY clause or must be part of an aggregate function.", err.strip_backtrace() ); } @@ -880,7 +880,7 @@ fn select_aggregate_with_having_referencing_column_not_in_select() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.first_name could not be resolved from available columns: count(*)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: Expression person.first_name could not be resolved from available columns: count(*), it must be in GROUP BY or an aggregate function", err.strip_backtrace() ); } @@ -1001,7 +1001,7 @@ fn select_aggregate_with_group_by_with_having_referencing_column_not_in_group_by HAVING MAX(age) > 10 AND last_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.last_name could not be resolved from available columns: person.first_name, max(person.age)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: Expression person.last_name could not be resolved from available columns: person.first_name, max(person.age), it must be in GROUP BY or an aggregate function", err.strip_backtrace() ); } @@ -1365,7 +1365,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_nested_and_not_res let sql = "SELECT ((age + 1) / 2) * (age + 9), MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name), it must be in GROUP BY or an aggregate function", err.strip_backtrace() ); } @@ -1375,7 +1375,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_and_its_column_sel let sql = "SELECT age, MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name)", + "Schema error: While expanding wildcard, column ’person.age‘ must appear in the GROUP BY clause or must be part of an aggregate function.", err.strip_backtrace() ); } @@ -1636,7 +1636,7 @@ fn select_7480_2() { let sql = "SELECT c1, c13, MIN(c12) FROM aggregate_test_100 GROUP BY c1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression aggregate_test_100.c13 could not be resolved from available columns: aggregate_test_100.c1, min(aggregate_test_100.c12)", + "Schema error: While expanding wildcard, column ’aggregate_test_100.c13‘ must appear in the GROUP BY clause or must be part of an aggregate function.", err.strip_backtrace() ); } diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 0cc8045dccd0..b4f6c9575c81 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3468,7 +3468,7 @@ SELECT r.sn, SUM(l.amount), r.amount # to associate it with other fields, aggregate should contain all the composite columns # if any of the composite column is missing, we cannot use associated indices, inside select expression # below query should fail -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.amount could not be resolved from available columns: r\.sn, sum\(l\.amount\) +statement error DataFusion error: Schema error: While expanding wildcard, column ’r\.amount‘ must appear in the GROUP BY clause or must be part of an aggregate function\. SELECT r.sn, SUM(l.amount), r.amount FROM sales_global_with_composite_pk AS l JOIN sales_global_with_composite_pk AS r @@ -3496,7 +3496,7 @@ NULL NULL NULL # left join shouldn't propagate right side constraint, # if right side is a unique key (unique and can contain null) # Please note that, above query and this one is same except the constraint in the table. -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.amount could not be resolved from available columns: r\.sn, sum\(r\.amount\) +statement error DataFusion error: Schema error: While expanding wildcard, column ’r\.amount‘ must appear in the GROUP BY clause or must be part of an aggregate function\. SELECT r.sn, r.amount, SUM(r.amount) FROM (SELECT * FROM sales_global_with_unique as l @@ -3542,7 +3542,7 @@ SELECT column1, COUNT(*) as column2 FROM (VALUES (['a', 'b'], 1), (['c', 'd', 'e # primary key should be aware from which columns it is associated -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.sn could not be resolved from available columns: l\.sn, l\.zip_code, l\.country, l\.ts, l\.currency, l\.amount, sum\(l\.amount\) +statement error DataFusion error: Schema error: While expanding wildcard, column ’r\.sn‘ must appear in the GROUP BY clause or must be part of an aggregate function. SELECT l.sn, r.sn, SUM(l.amount), r.amount FROM sales_global_with_pk AS l JOIN sales_global_with_pk AS r @@ -3633,7 +3633,7 @@ ORDER BY r.sn 4 100 2022-01-03T10:00:00 # after join, new window expressions shouldn't be associated with primary keys -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression rn1 could not be resolved from available columns: r\.sn, r\.ts, r\.amount, sum\(r\.amount\) +statement error DataFusion error: Schema error: While expanding wildcard, column ’rn1‘ must appear in the GROUP BY clause or must be part of an aggregate function. SELECT r.sn, SUM(r.amount), rn1 FROM (SELECT r.ts, r.sn, r.amount, @@ -5135,7 +5135,7 @@ statement ok CREATE TABLE test_case_expr(a INT, b TEXT) AS VALUES (1,'hello'), (2,'world') query T -SELECT (CASE WHEN CONCAT(b, 'hello') = 'test' THEN 'good' ELSE 'bad' END) AS c +SELECT (CASE WHEN CONCAT(b, 'hello') = 'test' THEN 'good' ELSE 'bad' END) AS c FROM test_case_expr GROUP BY c; ---- bad diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 9c46410c4909..0aaa00eb5ae1 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -32,14 +32,14 @@ AS VALUES statement ok CREATE TABLE nested_unnest_table -AS VALUES +AS VALUES (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), (struct('d', 'e', struct('f')), (struct('x', 'y', [30,40, 50])), null) ; statement ok CREATE TABLE recursive_unnest_table -AS VALUES +AS VALUES (struct([1], 'a'), [[[1],[2]],[[1,1]]], [struct([1],[[1,2]])]), (struct([2], 'b'), [[[3,4],[5]],[[null,6],null,[7,8]]], [struct([2],[[3],[4]])]) ; @@ -264,9 +264,9 @@ NULL NULL 17 NULL NULL 18 query IIIT -select - unnest(column1), unnest(column2) + 2, - column3 * 10, unnest(array_remove(column1, '4')) +select + unnest(column1), unnest(column2) + 2, + column3 * 10, unnest(array_remove(column1, '4')) from unnest_table; ---- 1 9 10 1 @@ -795,7 +795,7 @@ select unnest(unnest(column2)) c2, count(column3) from recursive_unnest_table gr [NULL, 6] 1 NULL 1 -query error DataFusion error: Error during planning: Projection references non\-aggregate values +query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: Expression nested_unnest_table\.column1 could not be resolved from available columns: UNNEST\(nested_unnest_table\.column1\)\[c0\], it must be in GROUP BY or an aggregate function select unnest(column1) c1 from nested_unnest_table group by c1.c0; # TODO: this query should work. see issue: https://github.com/apache/datafusion/issues/12794 @@ -875,7 +875,7 @@ query TT explain select * from unnest_table u, unnest(u.column1); ---- logical_plan -01)Cross Join: +01)Cross Join: 02)--SubqueryAlias: u 03)----TableScan: unnest_table projection=[column1, column2, column3, column4, column5] 04)--Subquery: From 4c7f3c8a5bff1c3cac149a0af315671f35e8bcf4 Mon Sep 17 00:00:00 2001 From: Jiashu-Hu Date: Mon, 17 Mar 2025 22:16:57 -0500 Subject: [PATCH 3/4] improved syntax requested by CICL process --- datafusion/sql/src/select.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index e9ab23b3492c..cabba0745f7b 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -869,7 +869,7 @@ impl SqlToRel<'_, S> { /// fn validate_columns_in_group_by_or_aggregate( expanded: &Vec, - aggregate_columns: &Vec, + aggregate_columns: &[Expr], ) -> Result<(), DataFusionError> { if !aggregate_columns.is_empty() { for e in expanded { From 42227bdf033aa658f71d8e2592e4eb4b3156f573 Mon Sep 17 00:00:00 2001 From: Jiashu-Hu Date: Tue, 18 Mar 2025 12:50:38 -0500 Subject: [PATCH 4/4] unified and , updated test error message --- datafusion/common/src/error.rs | 19 +----- datafusion/sql/src/select.rs | 58 +------------------ datafusion/sql/src/utils.rs | 2 +- datafusion/sql/tests/cases/diagnostic.rs | 2 +- datafusion/sql/tests/sql_integration.rs | 14 ++--- .../sqllogictest/test_files/group_by.slt | 8 +-- datafusion/sqllogictest/test_files/unnest.slt | 2 +- 7 files changed, 18 insertions(+), 87 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 8f42f60cf123..c50ec64759d5 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -164,27 +164,19 @@ macro_rules! context { #[derive(Debug)] pub enum SchemaError { /// Schema contains a (possibly) qualified and unqualified field with same unqualified name - AmbiguousReference { - field: Column, - }, + AmbiguousReference { field: Column }, /// Schema contains duplicate qualified field name DuplicateQualifiedField { qualifier: Box, name: String, }, /// Schema contains duplicate unqualified field name - DuplicateUnqualifiedField { - name: String, - }, + DuplicateUnqualifiedField { name: String }, /// No field with this name FieldNotFound { field: Box, valid_fields: Vec, }, - // Group by column does contains all the columns in the select list - GroupByColumnNotInSelectList { - column: String, - }, } impl Display for SchemaError { @@ -264,13 +256,6 @@ impl Display for SchemaError { ) } } - Self::GroupByColumnNotInSelectList { column } => { - write!( - f, - "While expanding wildcard, column ’{}‘ must appear in the GROUP BY clause or must be part of an aggregate function.", - column - ) - } } } } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index cabba0745f7b..e9cf4ce48a77 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -25,10 +25,9 @@ use crate::utils::{ CheckColumnsSatisfyExprsPurpose, }; -use datafusion_common::error::{DataFusionError, DataFusionErrorBuilder}; +use datafusion_common::error::DataFusionErrorBuilder; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::SchemaError::GroupByColumnNotInSelectList; -use datafusion_common::{not_impl_err, plan_err, Diagnostic, Result}; +use datafusion_common::{not_impl_err, plan_err, Result}; use datafusion_common::{RecursionUnnestOption, UnnestOptions}; use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ @@ -827,13 +826,6 @@ impl SqlToRel<'_, S> { .map(|expr| rebase_expr(expr, &aggr_projection_exprs, input)) .collect::>>()?; - // check if the columns in the SELECT list are in the GROUP BY clause - // or are part of an aggregate function, if not, throw an error. - validate_columns_in_group_by_or_aggregate( - &select_exprs_post_aggr, - &column_exprs_post_aggr, - )?; - // finally, we have some validation that the re-written projection can be resolved // from the aggregate output columns check_columns_satisfy_exprs( @@ -863,52 +855,6 @@ impl SqlToRel<'_, S> { } } -/// This function is for checking if the columns in the SELECT list are -/// in the GROUP BY clause or are part of an aggregate function. -/// If not, throw an SchemaError::GroupByColumnInvalid and return an error. -/// -fn validate_columns_in_group_by_or_aggregate( - expanded: &Vec, - aggregate_columns: &[Expr], -) -> Result<(), DataFusionError> { - if !aggregate_columns.is_empty() { - for e in expanded { - if let Expr::Column(col) = e { - let mut table_name = String::new(); - if let Some(relation) = &col.relation { - table_name = relation.to_string(); - } - let name = &col.name; - if !aggregate_columns.contains(e) { - if table_name.is_empty() { - let mut diagnostic = - Diagnostic::new_error(format!("Column '{}' must appear in the GROUP BY clause or be used in an aggregate function", &name), col.spans().first()); - diagnostic.add_help(format!("Either add '{}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({})", &name, &name), None); - return Err(DataFusionError::SchemaError( - GroupByColumnNotInSelectList { - column: name.clone(), - }, - Box::new(Some(DataFusionError::get_back_trace())), - ) - .with_diagnostic(diagnostic)); - } - let mut diagnostic = - Diagnostic::new_error(format!("Column '{}.{}' must appear in the GROUP BY clause or be used in an aggregate function", &table_name, &name), col.spans().first()); - diagnostic.add_help(format!("Either add '{}.{}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({}.{})", &table_name, &name, &table_name, &name), None); - return Err(DataFusionError::SchemaError( - GroupByColumnNotInSelectList { - column: (format!("{}.{}", &table_name, &name)), - }, - Box::new(Some(DataFusionError::get_back_trace())), - ) - .with_diagnostic(diagnostic)); - } - } - } - } - Ok(()) -} - // If there are any multiple-defined windows, we raise an error. fn check_conflicting_windows(window_defs: &[NamedWindowDefinition]) -> Result<()> { for (i, window_def_i) in window_defs.iter().enumerate() { diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index a0e54393af8a..bc2a94cd44ff 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -159,7 +159,7 @@ fn check_column_satisfies_expr( ) -> Result<()> { if !columns.contains(expr) { return plan_err!( - "{}: Expression {} could not be resolved from available columns: {}, it must be in GROUP BY or an aggregate function", + "{}: While expanding wildcard, column \"{}\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"{}\" appears in the SELECT clause satisfies this requirement", purpose.message_prefix(), expr, expr_vec_fmt!(columns) diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index 4daa78874023..b866d220d549 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -185,7 +185,7 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> { let diag = do_query(query); assert_eq!( diag.message, - "Column 'person.first_name' must appear in the GROUP BY clause or be used in an aggregate function" + "'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression" ); assert_eq!(diag.span, Some(spans["a"])); assert_eq!( diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 9ca07507413f..2939e965cd6e 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -820,7 +820,7 @@ fn select_with_having_refers_to_invalid_column() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: Expression person.first_name could not be resolved from available columns: person.id, max(person.age), it must be in GROUP BY or an aggregate function", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.id, max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -844,7 +844,7 @@ fn select_with_having_with_aggregate_not_in_select() { HAVING MAX(age) > 100"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Schema error: While expanding wildcard, column ’person.first_name‘ must appear in the GROUP BY clause or must be part of an aggregate function.", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -880,7 +880,7 @@ fn select_aggregate_with_having_referencing_column_not_in_select() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: Expression person.first_name could not be resolved from available columns: count(*), it must be in GROUP BY or an aggregate function", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"count(*)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1001,7 +1001,7 @@ fn select_aggregate_with_group_by_with_having_referencing_column_not_in_group_by HAVING MAX(age) > 10 AND last_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: Expression person.last_name could not be resolved from available columns: person.first_name, max(person.age), it must be in GROUP BY or an aggregate function", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.last_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.first_name, max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1365,7 +1365,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_nested_and_not_res let sql = "SELECT ((age + 1) / 2) * (age + 9), MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name), it must be in GROUP BY or an aggregate function", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.age\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.age + Int64(1), min(person.first_name)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1375,7 +1375,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_and_its_column_sel let sql = "SELECT age, MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Schema error: While expanding wildcard, column ’person.age‘ must appear in the GROUP BY clause or must be part of an aggregate function.", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.age\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.age + Int64(1), min(person.first_name)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1636,7 +1636,7 @@ fn select_7480_2() { let sql = "SELECT c1, c13, MIN(c12) FROM aggregate_test_100 GROUP BY c1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Schema error: While expanding wildcard, column ’aggregate_test_100.c13‘ must appear in the GROUP BY clause or must be part of an aggregate function.", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"aggregate_test_100.c13\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"aggregate_test_100.c1, min(aggregate_test_100.c12)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index b4f6c9575c81..75baba3efc4f 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3468,7 +3468,7 @@ SELECT r.sn, SUM(l.amount), r.amount # to associate it with other fields, aggregate should contain all the composite columns # if any of the composite column is missing, we cannot use associated indices, inside select expression # below query should fail -statement error DataFusion error: Schema error: While expanding wildcard, column ’r\.amount‘ must appear in the GROUP BY clause or must be part of an aggregate function\. +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.amount" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, sum\(l\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, SUM(l.amount), r.amount FROM sales_global_with_composite_pk AS l JOIN sales_global_with_composite_pk AS r @@ -3496,7 +3496,7 @@ NULL NULL NULL # left join shouldn't propagate right side constraint, # if right side is a unique key (unique and can contain null) # Please note that, above query and this one is same except the constraint in the table. -statement error DataFusion error: Schema error: While expanding wildcard, column ’r\.amount‘ must appear in the GROUP BY clause or must be part of an aggregate function\. +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.amount" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, sum\(r\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, r.amount, SUM(r.amount) FROM (SELECT * FROM sales_global_with_unique as l @@ -3542,7 +3542,7 @@ SELECT column1, COUNT(*) as column2 FROM (VALUES (['a', 'b'], 1), (['c', 'd', 'e # primary key should be aware from which columns it is associated -statement error DataFusion error: Schema error: While expanding wildcard, column ’r\.sn‘ must appear in the GROUP BY clause or must be part of an aggregate function. +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.sn" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "l\.sn, l\.zip_code, l\.country, l\.ts, l\.currency, l\.amount, sum\(l\.amount\)" appears in the SELECT clause satisfies this requirement SELECT l.sn, r.sn, SUM(l.amount), r.amount FROM sales_global_with_pk AS l JOIN sales_global_with_pk AS r @@ -3633,7 +3633,7 @@ ORDER BY r.sn 4 100 2022-01-03T10:00:00 # after join, new window expressions shouldn't be associated with primary keys -statement error DataFusion error: Schema error: While expanding wildcard, column ’rn1‘ must appear in the GROUP BY clause or must be part of an aggregate function. +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "rn1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, r\.ts, r\.amount, sum\(r\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, SUM(r.amount), rn1 FROM (SELECT r.ts, r.sn, r.amount, diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 0aaa00eb5ae1..b9c13582952a 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -795,7 +795,7 @@ select unnest(unnest(column2)) c2, count(column3) from recursive_unnest_table gr [NULL, 6] 1 NULL 1 -query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: Expression nested_unnest_table\.column1 could not be resolved from available columns: UNNEST\(nested_unnest_table\.column1\)\[c0\], it must be in GROUP BY or an aggregate function +query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "nested_unnest_table\.column1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "UNNEST\(nested_unnest_table\.column1\)\[c0\]" appears in the SELECT clause satisfies this requirement select unnest(column1) c1 from nested_unnest_table group by c1.c0; # TODO: this query should work. see issue: https://github.com/apache/datafusion/issues/12794