diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql new file mode 100755 index 0000000000000..80edf12ef8ac5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql @@ -0,0 +1,167 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- SELECT_IMPLICIT +-- Test cases for queries with ordering terms missing from the target list. +-- This used to be called "junkfilter.sql". +-- The parser uses the term "resjunk" to handle these cases. +-- - thomas 1998-07-09 +-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/select_implicit.sql +-- +-- This test file was converted from pgSQL/select_implicit.sql +-- [SPARK-28445] Inconsistency between Scala and Python/Pandas UDFs when groupby with udf() is used +-- TODO: We should add UDFs in the GROUP BY clause when [SPARK-28445] is resolved. 
+ +-- load test data (10 rows; column a is unique) +CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet; +INSERT INTO test_missing_target VALUES (0, 1, 'XXXX', 'A'); +INSERT INTO test_missing_target VALUES (1, 2, 'ABAB', 'b'); +INSERT INTO test_missing_target VALUES (2, 2, 'ABAB', 'c'); +INSERT INTO test_missing_target VALUES (3, 3, 'BBBB', 'D'); +INSERT INTO test_missing_target VALUES (4, 3, 'BBBB', 'e'); +INSERT INTO test_missing_target VALUES (5, 3, 'bbbb', 'F'); +INSERT INTO test_missing_target VALUES (6, 4, 'cccc', 'g'); +INSERT INTO test_missing_target VALUES (7, 4, 'cccc', 'h'); +INSERT INTO test_missing_target VALUES (8, 4, 'CCCC', 'I'); +INSERT INTO test_missing_target VALUES (9, 4, 'CCCC', 'j'); + + +-- w/ existing GROUP BY target +SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY +test_missing_target.c +ORDER BY udf(c); + +-- w/o existing GROUP BY target using a relation name in GROUP BY clause +SELECT udf(count(*)) FROM test_missing_target GROUP BY test_missing_target.c +ORDER BY udf(c); + +-- GROUP BY and ORDER BY reference different columns, neither in the target list +-- failure expected: ORDER BY references b, which is not a grouping column +SELECT udf(count(*)) FROM test_missing_target GROUP BY a ORDER BY udf(b); + +-- GROUP BY and ORDER BY reference the same column, which is not in the target list +SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b); + +-- w/ existing GROUP BY target using a relation name in target +SELECT udf(test_missing_target.b), udf(count(*)) + FROM test_missing_target GROUP BY b ORDER BY udf(b); + +-- ORDER BY column not in the target list (query has no GROUP BY) +SELECT udf(c) FROM test_missing_target ORDER BY udf(a); + +-- descending ORDER BY on a grouping column not in the target list +SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) desc; + +-- ORDER BY udf(1) with no GROUP BY: a single count over the whole table +SELECT udf(count(*)) FROM test_missing_target ORDER BY udf(1) desc; + +-- group and order using reference number +SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 1 ORDER BY 1; + +-- group using reference number out of range +-- 
failure expected: the select list has only two columns +SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 3; + +-- group w/o existing GROUP BY and ORDER BY target under ambiguous condition +-- failure expected: unqualified b could be x.b or y.b +SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY b ORDER BY udf(b); + +-- order w/ target under ambiguous condition +-- failure NOT expected +SELECT udf(a), udf(a) FROM test_missing_target + ORDER BY udf(a); + +-- order expression w/ target under ambiguous condition +-- failure NOT expected +SELECT udf(udf(a)/2), udf(udf(a)/2) FROM test_missing_target + ORDER BY udf(udf(a)/2); + +-- group expression w/ target under ambiguous condition +-- failure NOT expected +SELECT udf(a/2), udf(a/2) FROM test_missing_target + GROUP BY a/2 ORDER BY udf(a/2); + +-- group w/ existing GROUP BY target under ambiguous condition +SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b ORDER BY udf(x.b); + +-- group w/o existing GROUP BY target under ambiguous condition +SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b ORDER BY udf(x.b); + +-- [SPARK-28329] SELECT INTO syntax (the test below is kept commented out) +-- group w/o existing GROUP BY target under ambiguous condition +-- into a table +-- SELECT count(*) INTO TABLE test_missing_target2 +-- FROM test_missing_target x, test_missing_target y +-- WHERE x.a = y.a +-- GROUP BY x.b ORDER BY x.b; +-- SELECT * FROM test_missing_target2; + + +-- Functions and expressions + +-- w/ existing GROUP BY target +SELECT a%2, udf(count(udf(b))) FROM test_missing_target +GROUP BY test_missing_target.a%2 +ORDER BY udf(test_missing_target.a%2); + +-- w/o existing GROUP BY target using a relation name in GROUP BY clause +SELECT udf(count(c)) FROM test_missing_target +GROUP BY lower(test_missing_target.c) +ORDER BY udf(lower(test_missing_target.c)); + +-- GROUP BY and ORDER BY reference different columns, neither in the 
target list +-- failure expected: ORDER BY references b, which is not a grouping column +SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY a ORDER BY udf(b); + +-- GROUP BY and ORDER BY reference the same expression b/2, which is not in the target list +SELECT udf(count(b)) FROM test_missing_target GROUP BY b/2 ORDER BY udf(b/2); + +-- w/ existing GROUP BY target using a relation name in target +SELECT udf(lower(test_missing_target.c)), udf(count(udf(c))) + FROM test_missing_target GROUP BY lower(c) ORDER BY udf(lower(c)); + +-- ORDER BY expression not in the target list (query has no GROUP BY) +SELECT udf(a) FROM test_missing_target ORDER BY udf(upper(udf(d))); + +-- descending ORDER BY on a grouping expression not in the target list +SELECT udf(count(b)) FROM test_missing_target + GROUP BY (b + 1) / 2 ORDER BY udf((b + 1) / 2) desc; + +-- group w/o existing GROUP BY and ORDER BY target under ambiguous condition +-- failure expected: unqualified b could be x.b or y.b +SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY b/2 ORDER BY udf(b/2); + +-- group w/ existing GROUP BY target under ambiguous condition +SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x, +test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b/2 ORDER BY udf(x.b/2); + +-- group w/o existing GROUP BY target under ambiguous condition +-- failure expected due to ambiguous b in count(udf(b)) +SELECT udf(count(udf(b))) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b/2; + +-- [SPARK-28329] SELECT INTO syntax (the test below is kept commented out) +-- group w/o existing GROUP BY target under ambiguous condition +-- into a table +-- SELECT count(x.b) INTO TABLE test_missing_target3 +-- FROM test_missing_target x, test_missing_target y +-- WHERE x.a = y.a +-- GROUP BY x.b/2 ORDER BY x.b/2; +-- SELECT * FROM test_missing_target3; + +-- Cleanup (test_missing_target2/3 are never created by the active statements above) +DROP TABLE test_missing_target; +-- DROP TABLE test_missing_target2; +-- DROP TABLE test_missing_target3; diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out 
b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out new file mode 100755 index 0000000000000..e6a5995d24d25 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out @@ -0,0 +1,420 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 38 + + +-- !query 0 +CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +INSERT INTO test_missing_target VALUES (0, 1, 'XXXX', 'A') +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +INSERT INTO test_missing_target VALUES (1, 2, 'ABAB', 'b') +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +INSERT INTO test_missing_target VALUES (2, 2, 'ABAB', 'c') +-- !query 3 schema +struct<> +-- !query 3 output + + + +-- !query 4 +INSERT INTO test_missing_target VALUES (3, 3, 'BBBB', 'D') +-- !query 4 schema +struct<> +-- !query 4 output + + + +-- !query 5 +INSERT INTO test_missing_target VALUES (4, 3, 'BBBB', 'e') +-- !query 5 schema +struct<> +-- !query 5 output + + + +-- !query 6 +INSERT INTO test_missing_target VALUES (5, 3, 'bbbb', 'F') +-- !query 6 schema +struct<> +-- !query 6 output + + + +-- !query 7 +INSERT INTO test_missing_target VALUES (6, 4, 'cccc', 'g') +-- !query 7 schema +struct<> +-- !query 7 output + + + +-- !query 8 +INSERT INTO test_missing_target VALUES (7, 4, 'cccc', 'h') +-- !query 8 schema +struct<> +-- !query 8 output + + + +-- !query 9 +INSERT INTO test_missing_target VALUES (8, 4, 'CCCC', 'I') +-- !query 9 schema +struct<> +-- !query 9 output + + + +-- !query 10 +INSERT INTO test_missing_target VALUES (9, 4, 'CCCC', 'j') +-- !query 10 schema +struct<> +-- !query 10 output + + + +-- !query 11 +SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY +test_missing_target.c +ORDER BY udf(c) +-- !query 11 schema +struct +-- !query 11 output +ABAB 2 +BBBB 2 +CCCC 2 +XXXX 1 +bbbb 1 
+cccc 2 + + +-- !query 12 +SELECT udf(count(*)) FROM test_missing_target GROUP BY test_missing_target.c +ORDER BY udf(c) +-- !query 12 schema +struct +-- !query 12 output +2 +2 +2 +1 +1 +2 + + +-- !query 13 +SELECT udf(count(*)) FROM test_missing_target GROUP BY a ORDER BY udf(b) +-- !query 13 schema +struct<> +-- !query 13 output +org.apache.spark.sql.AnalysisException +cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 70 + + +-- !query 14 +SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) +-- !query 14 schema +struct +-- !query 14 output +1 +2 +3 +4 + + +-- !query 15 +SELECT udf(test_missing_target.b), udf(count(*)) + FROM test_missing_target GROUP BY b ORDER BY udf(b) +-- !query 15 schema +struct +-- !query 15 output +1 1 +2 2 +3 3 +4 4 + + +-- !query 16 +SELECT udf(c) FROM test_missing_target ORDER BY udf(a) +-- !query 16 schema +struct +-- !query 16 output +XXXX +ABAB +ABAB +BBBB +BBBB +bbbb +cccc +cccc +CCCC +CCCC + + +-- !query 17 +SELECT udf(count(*)) FROM test_missing_target GROUP BY b ORDER BY udf(b) desc +-- !query 17 schema +struct +-- !query 17 output +4 +3 +2 +1 + + +-- !query 18 +SELECT udf(count(*)) FROM test_missing_target ORDER BY udf(1) desc +-- !query 18 schema +struct +-- !query 18 output +10 + + +-- !query 19 +SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 1 ORDER BY 1 +-- !query 19 schema +struct +-- !query 19 output +ABAB 2 +BBBB 2 +CCCC 2 +XXXX 1 +bbbb 1 +cccc 2 + + +-- !query 20 +SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 3 +-- !query 20 schema +struct<> +-- !query 20 output +org.apache.spark.sql.AnalysisException +GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 63 + + +-- !query 21 +SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY b ORDER BY udf(b) +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException 
+Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10 + + +-- !query 22 +SELECT udf(a), udf(a) FROM test_missing_target + ORDER BY udf(a) +-- !query 22 schema +struct +-- !query 22 output +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 + + +-- !query 23 +SELECT udf(udf(a)/2), udf(udf(a)/2) FROM test_missing_target + ORDER BY udf(udf(a)/2) +-- !query 23 schema +struct +-- !query 23 output +0 0 +0 0 +1 1 +1 1 +2 2 +2 2 +3 3 +3 3 +4 4 +4 4 + + +-- !query 24 +SELECT udf(a/2), udf(a/2) FROM test_missing_target + GROUP BY a/2 ORDER BY udf(a/2) +-- !query 24 schema +struct +-- !query 24 output +0 0 +1 1 +2 2 +3 3 +4 4 + + +-- !query 25 +SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b ORDER BY udf(x.b) +-- !query 25 schema +struct +-- !query 25 output +1 1 +2 2 +3 3 +4 4 + + +-- !query 26 +SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b ORDER BY udf(x.b) +-- !query 26 schema +struct +-- !query 26 output +1 +2 +3 +4 + + +-- !query 27 +SELECT a%2, udf(count(udf(b))) FROM test_missing_target +GROUP BY test_missing_target.a%2 +ORDER BY udf(test_missing_target.a%2) +-- !query 27 schema +struct<(a % 2):int,CAST(udf(cast(count(cast(udf(cast(b as string)) as int)) as string)) AS BIGINT):bigint> +-- !query 27 output +0 5 +1 5 + + +-- !query 28 +SELECT udf(count(c)) FROM test_missing_target +GROUP BY lower(test_missing_target.c) +ORDER BY udf(lower(test_missing_target.c)) +-- !query 28 schema +struct +-- !query 28 output +2 +3 +4 +1 + + +-- !query 29 +SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY a ORDER BY udf(b) +-- !query 29 schema +struct<> +-- !query 29 output +org.apache.spark.sql.AnalysisException +cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 75 + + +-- !query 30 +SELECT udf(count(b)) FROM test_missing_target GROUP BY b/2 
ORDER BY udf(b/2) +-- !query 30 schema +struct +-- !query 30 output +1 +5 +4 + + +-- !query 31 +SELECT udf(lower(test_missing_target.c)), udf(count(udf(c))) + FROM test_missing_target GROUP BY lower(c) ORDER BY udf(lower(c)) +-- !query 31 schema +struct +-- !query 31 output +abab 2 +bbbb 3 +cccc 4 +xxxx 1 + + +-- !query 32 +SELECT udf(a) FROM test_missing_target ORDER BY udf(upper(udf(d))) +-- !query 32 schema +struct +-- !query 32 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 33 +SELECT udf(count(b)) FROM test_missing_target + GROUP BY (b + 1) / 2 ORDER BY udf((b + 1) / 2) desc +-- !query 33 schema +struct +-- !query 33 output +7 +3 + + +-- !query 34 +SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY b/2 ORDER BY udf(b/2) +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10 + + +-- !query 35 +SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x, +test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b/2 ORDER BY udf(x.b/2) +-- !query 35 schema +struct +-- !query 35 output +0 1 +1 5 +2 4 + + +-- !query 36 +SELECT udf(count(udf(b))) FROM test_missing_target x, test_missing_target y + WHERE udf(x.a) = udf(y.a) + GROUP BY x.b/2 +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +Reference 'b' is ambiguous, could be: x.b, y.b.; line 1 pos 21 + + +-- !query 37 +DROP TABLE test_missing_target +-- !query 37 schema +struct<> +-- !query 37 output +