diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 4a248de101dc..bc2a94cd44ff 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -102,10 +102,10 @@ impl CheckColumnsSatisfyExprsPurpose { fn message_prefix(&self) -> &'static str { match self { CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate => { - "Projection references non-aggregate values" + "Column in SELECT must be in GROUP BY or an aggregate function" } CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate => { - "HAVING clause references non-aggregate values" + "Column in HAVING must be in GROUP BY or an aggregate function" } } } @@ -159,7 +159,7 @@ fn check_column_satisfies_expr( ) -> Result<()> { if !columns.contains(expr) { return plan_err!( - "{}: Expression {} could not be resolved from available columns: {}", + "{}: While expanding wildcard, column \"{}\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"{}\" appears in the SELECT clause satisfies this requirement", purpose.message_prefix(), expr, expr_vec_fmt!(columns) @@ -169,7 +169,7 @@ fn check_column_satisfies_expr( purpose.diagnostic_message(expr), expr.spans().and_then(|spans| spans.first()), ) - .with_help(format!("add '{expr}' to GROUP BY clause"), None); + .with_help(format!("Either add '{expr}' to GROUP BY clause, or use an aggregate function like ANY_VALUE({expr})"), None); err.with_diagnostic(diagnostic) }); } @@ -496,30 +496,30 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> { /// /// For example an expr of **unnest(unnest(column1)) + unnest(unnest(unnest(column2)))** /// ```text - /// ┌──────────────────┐ - /// │ binaryexpr │ - /// │ │ - /// └──────────────────┘ - /// f_down / / │ │ - /// / / f_up │ │ - /// / / f_down│ │f_up - /// unnest │ │ - /// │ │ - /// f_down / / f_up(rewriting) │ │ - /// / / - /// / / unnest - /// unnest - /// f_down / / f_up(rewriting) - /// f_down / /f_up / / - /// / / / / - /// / / 
unnest - /// column1 - /// f_down / /f_up - /// / / - /// / / - /// column2 + /// ┌──────────────────┐ + /// │ binaryexpr │ + /// │ │ + /// └──────────────────┘ + /// f_down / / │ │ + /// / / f_up │ │ + /// / / f_down│ │f_up + /// unnest │ │ + /// │ │ + /// f_down / / f_up(rewriting) │ │ + /// / / + /// / / unnest + /// unnest + /// f_down / / f_up(rewriting) + /// f_down / /f_up / / + /// / / / / + /// / / unnest + /// column1 + /// f_down / /f_up + /// / / + /// / / + /// column2 /// ``` - /// + /// fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>> { if let Expr::Unnest(ref traversing_unnest) = expr { if traversing_unnest == self.top_most_unnest.as_ref().unwrap() { diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index 5481f046e70a..b866d220d549 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -190,7 +190,7 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> { assert_eq!(diag.span, Some(spans["a"])); assert_eq!( diag.helps[0].message, - "add 'person.first_name' to GROUP BY clause" + "Either add 'person.first_name' to GROUP BY clause, or use an aggregate function like ANY_VALUE(person.first_name)" ); Ok(()) } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 5717f9943e59..2939e965cd6e 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -820,7 +820,7 @@ fn select_with_having_refers_to_invalid_column() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.first_name could not be resolved from available columns: person.id, max(person.age)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must 
be part of an aggregate function, currently only \"person.id, max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -844,7 +844,7 @@ fn select_with_having_with_aggregate_not_in_select() { HAVING MAX(age) > 100"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.first_name could not be resolved from available columns: max(person.age)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -880,7 +880,7 @@ fn select_aggregate_with_having_referencing_column_not_in_select() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.first_name could not be resolved from available columns: count(*)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"count(*)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1001,7 +1001,7 @@ fn select_aggregate_with_group_by_with_having_referencing_column_not_in_group_by HAVING MAX(age) > 10 AND last_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.last_name could not be resolved from available columns: person.first_name, max(person.age)", + "Error during planning: Column in HAVING must be in 
GROUP BY or an aggregate function: While expanding wildcard, column \"person.last_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.first_name, max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1365,7 +1365,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_nested_and_not_res let sql = "SELECT ((age + 1) / 2) * (age + 9), MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.age\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.age + Int64(1), min(person.first_name)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1375,7 +1375,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_and_its_column_sel let sql = "SELECT age, MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.age\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.age + Int64(1), min(person.first_name)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1636,7 +1636,7 @@ fn select_7480_2() { let sql = "SELECT c1, 
c13, MIN(c12) FROM aggregate_test_100 GROUP BY c1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression aggregate_test_100.c13 could not be resolved from available columns: aggregate_test_100.c1, min(aggregate_test_100.c12)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"aggregate_test_100.c13\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"aggregate_test_100.c1, min(aggregate_test_100.c12)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 0cc8045dccd0..75baba3efc4f 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3468,7 +3468,7 @@ SELECT r.sn, SUM(l.amount), r.amount # to associate it with other fields, aggregate should contain all the composite columns # if any of the composite column is missing, we cannot use associated indices, inside select expression # below query should fail -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.amount could not be resolved from available columns: r\.sn, sum\(l\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.amount" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, sum\(l\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, SUM(l.amount), r.amount FROM sales_global_with_composite_pk AS l JOIN sales_global_with_composite_pk AS r @@ -3496,7 +3496,7 @@ NULL NULL NULL # left join shouldn't propagate right side constraint, # 
if right side is a unique key (unique and can contain null) # Please note that, above query and this one is same except the constraint in the table. -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.amount could not be resolved from available columns: r\.sn, sum\(r\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.amount" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, sum\(r\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, r.amount, SUM(r.amount) FROM (SELECT * FROM sales_global_with_unique as l @@ -3542,7 +3542,7 @@ SELECT column1, COUNT(*) as column2 FROM (VALUES (['a', 'b'], 1), (['c', 'd', 'e # primary key should be aware from which columns it is associated -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.sn could not be resolved from available columns: l\.sn, l\.zip_code, l\.country, l\.ts, l\.currency, l\.amount, sum\(l\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.sn" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "l\.sn, l\.zip_code, l\.country, l\.ts, l\.currency, l\.amount, sum\(l\.amount\)" appears in the SELECT clause satisfies this requirement SELECT l.sn, r.sn, SUM(l.amount), r.amount FROM sales_global_with_pk AS l JOIN sales_global_with_pk AS r @@ -3633,7 +3633,7 @@ ORDER BY r.sn 4 100 2022-01-03T10:00:00 # after join, new window expressions shouldn't be associated with primary keys -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression rn1 could not be resolved from available columns: r\.sn, 
r\.ts, r\.amount, sum\(r\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "rn1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, r\.ts, r\.amount, sum\(r\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, SUM(r.amount), rn1 FROM (SELECT r.ts, r.sn, r.amount, @@ -5135,7 +5135,7 @@ statement ok CREATE TABLE test_case_expr(a INT, b TEXT) AS VALUES (1,'hello'), (2,'world') query T -SELECT (CASE WHEN CONCAT(b, 'hello') = 'test' THEN 'good' ELSE 'bad' END) AS c +SELECT (CASE WHEN CONCAT(b, 'hello') = 'test' THEN 'good' ELSE 'bad' END) AS c FROM test_case_expr GROUP BY c; ---- bad diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 9c46410c4909..b9c13582952a 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -32,14 +32,14 @@ AS VALUES statement ok CREATE TABLE nested_unnest_table -AS VALUES +AS VALUES (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), (struct('d', 'e', struct('f')), (struct('x', 'y', [30,40, 50])), null) ; statement ok CREATE TABLE recursive_unnest_table -AS VALUES +AS VALUES (struct([1], 'a'), [[[1],[2]],[[1,1]]], [struct([1],[[1,2]])]), (struct([2], 'b'), [[[3,4],[5]],[[null,6],null,[7,8]]], [struct([2],[[3],[4]])]) ; @@ -264,9 +264,9 @@ NULL NULL 17 NULL NULL 18 query IIIT -select - unnest(column1), unnest(column2) + 2, - column3 * 10, unnest(array_remove(column1, '4')) +select + unnest(column1), unnest(column2) + 2, + column3 * 10, unnest(array_remove(column1, '4')) from unnest_table; ---- 1 9 10 1 @@ -795,7 +795,7 @@ select unnest(unnest(column2)) c2, count(column3) from recursive_unnest_table gr [NULL, 6] 1 NULL 1 -query error DataFusion error: Error during planning: Projection references non\-aggregate 
values +query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "nested_unnest_table\.column1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "UNNEST\(nested_unnest_table\.column1\)\[c0\]" appears in the SELECT clause satisfies this requirement select unnest(column1) c1 from nested_unnest_table group by c1.c0; # TODO: this query should work. see issue: https://github.com/apache/datafusion/issues/12794 @@ -875,7 +875,7 @@ query TT explain select * from unnest_table u, unnest(u.column1); ---- logical_plan -01)Cross Join: +01)Cross Join: 02)--SubqueryAlias: u 03)----TableScan: unnest_table projection=[column1, column2, column3, column4, column5] 04)--Subquery: