From 53699d168f1a048245253d89e46a3d1cb12cf713 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Tue, 4 Feb 2020 23:20:19 +0800 Subject: [PATCH 1/6] change the default behavior --- docs/sql-migration-guide.md | 2 +- .../catalyst/analysis/CTESubstitution.scala | 39 +++- .../apache/spark/sql/internal/SQLConf.scala | 4 +- .../sql-tests/inputs/cte-nonlegacy.sql | 115 ++++++++++ .../sql-tests/results/cte-nonlegacy.sql.out | 208 ++++++++++++++++++ .../resources/sql-tests/results/cte.sql.out | 50 +++-- 6 files changed, 393 insertions(+), 25 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index e4d2358b5de09..ac6c8576e38d1 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -103,7 +103,7 @@ license: | - Since Spark 3.0, if files or subdirectories disappear during recursive directory listing (i.e. they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless `spark.sql.files.ignoreMissingFiles` is `true` (default `false`). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during `REFRESH TABLE`), not during query execution: the net change is that `spark.sql.files.ignoreMissingFiles` is now obeyed during table file listing / query planning, not only at query execution time. - - Since Spark 3.0, substitution order of nested WITH clauses is changed and an inner CTE definition takes precedence over an outer. In version 2.4 and earlier, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `1` while in version 3.0 it returns `2`. The previous behaviour can be restored by setting `spark.sql.legacy.ctePrecedence.enabled` to `true`. + - Since Spark 3.0, an AnalysisException will happen by default in case of nested WITH clauses, it forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false, inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `1` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. - Since Spark 3.0, the `add_months` function does not adjust the resulting date to a last day of month if the original date is a last day of months. For example, `select add_months(DATE'2019-02-28', 1)` results `2019-03-28`. In Spark version 2.4 and earlier, the resulting date is adjusted when the original date is a last day of months. For example, adding a month to `2019-02-28` results in `2019-03-31`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index 60e6bf8db06d7..f3d47b6e277aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -17,24 +17,57 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.LEGACY_CTE_PRECEDENCE_ENABLED /** * Analyze WITH nodes and substitute child plan with CTE definitions. */ object CTESubstitution extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - if (SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_ENABLED)) { + if (SQLConf.get.legacyCTEPrecedenceEnabled.isEmpty) { + if (hasNestedCTE(plan, inTraverse = false)) { + throw new AnalysisException("The ambiguous nested CTE was founded in current plan, " + + "please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, " + + "see more detail in SPARK-28228.") + } else { + traverseAndSubstituteCTE(plan, inTraverse = false) + } + } else if (SQLConf.get.legacyCTEPrecedenceEnabled.get) { legacyTraverseAndSubstituteCTE(plan) } else { - traverseAndSubstituteCTE(plan, false) + traverseAndSubstituteCTE(plan, inTraverse = false) } } + /** + * Check the plan to be traversed has nested CTE or not, find the nested WITH clause in + * child, innerChildren and subquery for the current plan. + */ + private def hasNestedCTE(plan: LogicalPlan, inTraverse: Boolean): Boolean = { + plan.foreach { + case _: With if inTraverse => + return true + + case w @ With(child: LogicalPlan, _) => + (w.innerChildren :+ child).foreach { p => + if (hasNestedCTE(p, inTraverse = true)) return true + } + + case other if inTraverse => + other.transformExpressions { + case e: SubqueryExpression if hasNestedCTE(e.plan, inTraverse = true) => + return true + } + + case _ => + } + false + } + private def legacyTraverseAndSubstituteCTE(plan: LogicalPlan): LogicalPlan = { plan.resolveOperatorsUp { case With(child, relations) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3ad3416256c7d..4dbdee9fb2ba4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2079,7 +2079,7 @@ object SQLConf { .internal() .doc("When true, outer CTE definitions takes precedence over inner definitions.") .booleanConf - .createWithDefault(false) + .createOptional val LEGACY_ARRAY_EXISTS_FOLLOWS_THREE_VALUED_LOGIC = buildConf("spark.sql.legacy.arrayExistsFollowsThreeValuedLogic") @@ -2714,6 +2714,8 @@ class SQLConf extends Serializable with Logging { def csvFilterPushDown: Boolean = getConf(CSV_FILTER_PUSHDOWN_ENABLED) + def legacyCTEPrecedenceEnabled: Option[Boolean] = getConf(LEGACY_CTE_PRECEDENCE_ENABLED) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql new file mode 100644 index 0000000000000..599e2edb37cb6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql @@ -0,0 +1,115 @@ +create temporary view t as select * from values 0, 1, 2 as t(id); +create temporary view t2 as select * from values 0, 1 as t(id); + +-- CTE legacy substitution +SET spark.sql.legacy.ctePrecedence.enabled=false; + +-- CTE in CTE definition +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * FROM t; + +-- CTE in subquery +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +); + +-- CTE in subquery expression +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +); + +-- CTE in CTE definition shadows outer +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2; + +-- CTE in CTE definition shadows outer 2 +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2; + +-- CTE in CTE definition shadows outer 3 +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2; + +-- CTE in subquery shadows outer +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +); + +-- CTE in subquery shadows outer 2 +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +); + +-- CTE in subquery shadows outer 3 +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +); + +-- CTE in subquery expression shadows outer +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +); + +-- CTE in subquery expression shadows outer 2 +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +); + +-- CTE in subquery expression shadows outer 3 +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +); + +-- Clean up +DROP VIEW IF EXISTS t; +DROP VIEW IF EXISTS t2; diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out new file mode 100644 index 0000000000000..f44ac8176c599 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -0,0 +1,208 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 17 + + +-- !query +create temporary view t as select * from values 0, 1, 2 as t(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temporary view t2 as select * from values 0, 1 as t(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +SET spark.sql.legacy.ctePrecedence.enabled=false +-- !query schema +struct +-- !query output +spark.sql.legacy.ctePrecedence.enabled false + + +-- !query +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2 +-- !query schema +struct<2:int> +-- !query output +2 + + +-- !query +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2 +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2 +-- !query schema +struct<3:int> +-- !query output +3 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +DROP VIEW IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t2 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index 2d87781193c25..dfd68d7e634f0 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -168,9 +168,10 @@ WITH t as ( ) SELECT * FROM t -- !query schema -struct<1:int> +struct<> -- !query output -1 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -204,9 +205,10 @@ WITH ) SELECT * FROM t2 -- !query schema -struct<2:int> +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -222,9 +224,10 @@ WITH ) SELECT * FROM t2 -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -240,9 +243,10 @@ WITH ) SELECT * FROM t2 -- !query schema -struct<3:int> +struct<> -- !query output -3 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -252,9 +256,10 @@ SELECT max(c) FROM ( SELECT * FROM t ) -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -266,9 +271,10 @@ SELECT sum(c) FROM ( ) ) -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -281,9 +287,10 @@ SELECT sum(c) FROM ( ) ) -- !query schema -struct +struct<> -- !query output -3 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -293,9 +300,10 @@ SELECT ( SELECT * FROM t ) -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -307,9 +315,10 @@ SELECT ( ) ) -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -322,9 +331,10 @@ SELECT ( ) ) -- !query schema -struct +struct<> -- !query output -3 +org.apache.spark.sql.AnalysisException +The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query From f75b26a5aeab71bdc0a4a2f4f6da7075f6b2f996 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Wed, 5 Feb 2020 11:04:33 +0800 Subject: [PATCH 2/6] fix influence test cases --- .../sql-tests/inputs/cte-nonlegacy.sql | 2 +- .../sql-tests/inputs/postgreSQL/with.sql | 3 ++ .../sql-tests/results/postgreSQL/with.sql.out | 10 +++- .../org/apache/spark/sql/SubquerySuite.scala | 46 ++++++++++--------- 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql index 599e2edb37cb6..796c4b768e9da 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql @@ -1,7 +1,7 @@ create temporary view t as select * from values 0, 1, 2 as t(id); create temporary view t2 as select * from values 0, 1 as t(id); --- CTE legacy substitution +-- CTE non-legacy substitution SET spark.sql.legacy.ctePrecedence.enabled=false; -- CTE in CTE definition diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql index 83c6fd8cbac91..65ed5de33834d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql @@ -685,6 +685,9 @@ with cte(foo) as ( select 42 ) select * from ((select foo from cte)) q; -- test WITH attached to intermediate-level set operation -- +-- CTE non-legacy substitution +SET spark.sql.legacy.ctePrecedence.enabled=false; + WITH outermost(x) AS ( SELECT 1 UNION (WITH innermost as (SELECT 2) diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out index badafc9e659e2..68064347710f5 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 51 +-- Number of queries: 52 -- !query @@ -191,6 +191,14 @@ struct 42 +-- !query +SET spark.sql.legacy.ctePrecedence.enabled=false +-- !query schema +struct +-- !query output +spark.sql.legacy.ctePrecedence.enabled false + + -- !query WITH outermost(x) AS ( SELECT 1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index ff8f94c68c5ee..d06eda85d42bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -103,28 +103,30 @@ class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("define CTE in CTE subquery") { - checkAnswer( - sql( - """ - | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1) - | select a from (select 1 as a union all select 2 as a) t - | where a = (select max(b) from t2) - """.stripMargin), - Array(Row(1)) - ) - checkAnswer( - sql( - """ - | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1), - | t3 as ( - | with t4 as (select 1 as d, 3 as e) - | select * from t4 cross join t2 where t2.b = t4.d - | ) - | select a from (select 1 as a union all select 2 as a) - | where a = (select max(d) from t3) - """.stripMargin), - Array(Row(1)) - ) + withSQLConf(SQLConf.LEGACY_CTE_PRECEDENCE_ENABLED.key -> "false") { + checkAnswer( + sql( + """ + | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1) + | select a from (select 1 as a union all select 2 as a) t + | where a = (select max(b) from t2) + """.stripMargin), + Array(Row(1)) + ) + checkAnswer( + sql( + """ + | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1), + | t3 as ( + | with t4 as (select 1 as d, 3 as e) + | select * from t4 cross join t2 where t2.b = t4.d + | ) + | select a from (select 1 as a union all select 2 as a) + | where a = (select max(d) from t3) + """.stripMargin), + Array(Row(1)) + ) + } } test("uncorrelated scalar subquery in CTE") { From a55e82dc3275f6e877eacdceed2de8e55daa2f5d Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Thu, 6 Feb 2020 03:01:39 +0800 Subject: [PATCH 3/6] address comment --- docs/sql-migration-guide.md | 2 +- .../catalyst/analysis/CTESubstitution.scala | 38 +++++++++------ .../apache/spark/sql/internal/SQLConf.scala | 2 - .../sql-tests/inputs/postgreSQL/with.sql | 3 -- .../resources/sql-tests/results/cte.sql.out | 23 +++++----- .../sql-tests/results/postgreSQL/with.sql.out | 10 +--- .../org/apache/spark/sql/SubquerySuite.scala | 46 +++++++++---------- 7 files changed, 59 insertions(+), 65 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index ac6c8576e38d1..f93cd6c90aa93 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -103,7 +103,7 @@ license: | - Since Spark 3.0, if files or subdirectories disappear during recursive directory listing (i.e. they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless `spark.sql.files.ignoreMissingFiles` is `true` (default `false`). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during `REFRESH TABLE`), not during query execution: the net change is that `spark.sql.files.ignoreMissingFiles` is now obeyed during table file listing / query planning, not only at query execution time. - - Since Spark 3.0, an AnalysisException will happen by default in case of nested WITH clauses, it forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false, inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `1` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. + - Since Spark 3.0, Spark throws an AnalysisException if name conflict is detected in the nested WITH clause. It forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false, inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. - Since Spark 3.0, the `add_months` function does not adjust the resulting date to a last day of month if the original date is a last day of months. For example, `select add_months(DATE'2019-02-28', 1)` results `2019-03-28`. In Spark version 2.4 and earlier, the resulting date is adjusted when the original date is a last day of months. For example, adding a month to `2019-02-28` results in `2019-03-31`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index f3d47b6e277aa..7e977241bc45c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -22,21 +22,23 @@ import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LEGACY_CTE_PRECEDENCE_ENABLED /** * Analyze WITH nodes and substitute child plan with CTE definitions. */ object CTESubstitution extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - if (SQLConf.get.legacyCTEPrecedenceEnabled.isEmpty) { - if (hasNestedCTE(plan, inTraverse = false)) { - throw new AnalysisException("The ambiguous nested CTE was founded in current plan, " + - "please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, " + + val isLegacy = SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_ENABLED) + if (isLegacy.isEmpty) { + if (detectConflictInNestedCTE(plan, inTraverse = false)) { + throw new AnalysisException("Name collision in nested CTE was founded in current plan, " + + s"please select the CTE precedence via ${LEGACY_CTE_PRECEDENCE_ENABLED.key}, " + "see more detail in SPARK-28228.") } else { traverseAndSubstituteCTE(plan, inTraverse = false) } - } else if (SQLConf.get.legacyCTEPrecedenceEnabled.get) { + } else if (isLegacy.get) { legacyTraverseAndSubstituteCTE(plan) } else { traverseAndSubstituteCTE(plan, inTraverse = false) @@ -44,23 +46,31 @@ object CTESubstitution extends Rule[LogicalPlan] { } /** - * Check the plan to be traversed has nested CTE or not, find the nested WITH clause in + * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through * child, innerChildren and subquery for the current plan. */ - private def hasNestedCTE(plan: LogicalPlan, inTraverse: Boolean): Boolean = { + private def detectConflictInNestedCTE( + plan: LogicalPlan, + inTraverse: Boolean, + cteNames: Set[String] = Set.empty): Boolean = { plan.foreach { - case _: With if inTraverse => - return true - - case w @ With(child: LogicalPlan, _) => + case w @ With(child, relations) => + val newNames = relations.map { + case (cteName, _) => + if (cteNames.contains(cteName)) { + return true + } else { + cteName + } + }.toSet (w.innerChildren :+ child).foreach { p => - if (hasNestedCTE(p, inTraverse = true)) return true + if (detectConflictInNestedCTE(p, inTraverse = true, cteNames ++ newNames)) return true } case other if inTraverse => other.transformExpressions { - case e: SubqueryExpression if hasNestedCTE(e.plan, inTraverse = true) => - return true + case e: SubqueryExpression + if detectConflictInNestedCTE(e.plan, inTraverse = true, cteNames) => return true } case _ => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4dbdee9fb2ba4..1e01d4cf6a042 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2714,8 +2714,6 @@ class SQLConf extends Serializable with Logging { def csvFilterPushDown: Boolean = getConf(CSV_FILTER_PUSHDOWN_ENABLED) - def legacyCTEPrecedenceEnabled: Option[Boolean] = getConf(LEGACY_CTE_PRECEDENCE_ENABLED) - /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql index 65ed5de33834d..83c6fd8cbac91 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql @@ -685,9 +685,6 @@ with cte(foo) as ( select 42 ) select * from ((select foo from cte)) q; -- test WITH attached to intermediate-level set operation -- --- CTE non-legacy substitution -SET spark.sql.legacy.ctePrecedence.enabled=false; - WITH outermost(x) AS ( SELECT 1 UNION (WITH innermost as (SELECT 2) diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index dfd68d7e634f0..2ef4ef1c8399c 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -168,10 +168,9 @@ WITH t as ( ) SELECT * FROM t -- !query schema -struct<> +struct<1:int> -- !query output -org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +1 -- !query @@ -208,7 +207,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -227,7 +226,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -246,7 +245,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -259,7 +258,7 @@ SELECT max(c) FROM ( struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -274,7 +273,7 @@ SELECT sum(c) FROM ( struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -290,7 +289,7 @@ SELECT sum(c) FROM ( struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -303,7 +302,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -318,7 +317,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query @@ -334,7 +333,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -The ambiguous nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out index 68064347710f5..badafc9e659e2 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 52 +-- Number of queries: 51 -- !query @@ -191,14 +191,6 @@ struct 42 --- !query -SET spark.sql.legacy.ctePrecedence.enabled=false --- !query schema -struct --- !query output -spark.sql.legacy.ctePrecedence.enabled false - - -- !query WITH outermost(x) AS ( SELECT 1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index d06eda85d42bb..ff8f94c68c5ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -103,30 +103,28 @@ class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("define CTE in CTE subquery") { - withSQLConf(SQLConf.LEGACY_CTE_PRECEDENCE_ENABLED.key -> "false") { - checkAnswer( - sql( - """ - | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1) - | select a from (select 1 as a union all select 2 as a) t - | where a = (select max(b) from t2) - """.stripMargin), - Array(Row(1)) - ) - checkAnswer( - sql( - """ - | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1), - | t3 as ( - | with t4 as (select 1 as d, 3 as e) - | select * from t4 cross join t2 where t2.b = t4.d - | ) - | select a from (select 1 as a union all select 2 as a) - | where a = (select max(d) from t3) - """.stripMargin), - Array(Row(1)) - ) - } + checkAnswer( + sql( + """ + | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1) + | select a from (select 1 as a union all select 2 as a) t + | where a = (select max(b) from t2) + """.stripMargin), + Array(Row(1)) + ) + checkAnswer( + sql( + """ + | with t2 as (with t1 as (select 1 as b, 2 as c) select b, c from t1), + | t3 as ( + | with t4 as (select 1 as d, 3 as e) + | select * from t4 cross join t2 where t2.b = t4.d + | ) + | select a from (select 1 as a union all select 2 as a) + | where a = (select max(d) from t3) + """.stripMargin), + Array(Row(1)) + ) } test("uncorrelated scalar subquery in CTE") { From c03920a5bf4efc3e6867b5ddde120003bc63f0c5 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Thu, 6 Feb 2020 18:46:28 +0800 Subject: [PATCH 4/6] address comment --- docs/sql-migration-guide.md | 2 +- .../catalyst/analysis/CTESubstitution.scala | 25 ++- .../sql-tests/inputs/cte-nonlegacy.sql | 117 +------------- .../sql-tests/results/cte-nonlegacy.sql.out | 143 +++++++++++++++++- .../resources/sql-tests/results/cte.sql.out | 18 +-- 5 files changed, 162 insertions(+), 143 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index f93cd6c90aa93..84374650a6982 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -103,7 +103,7 @@ license: | - Since Spark 3.0, if files or subdirectories disappear during recursive directory listing (i.e. they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless `spark.sql.files.ignoreMissingFiles` is `true` (default `false`). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during `REFRESH TABLE`), not during query execution: the net change is that `spark.sql.files.ignoreMissingFiles` is now obeyed during table file listing / query planning, not only at query execution time. - - Since Spark 3.0, Spark throws an AnalysisException if name conflict is detected in the nested WITH clause. It forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false, inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. + - Since Spark 3.0, Spark throws an AnalysisException if name conflict is detected in the nested WITH clause. It forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false (which is recommended), inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. - Since Spark 3.0, the `add_months` function does not adjust the resulting date to a last day of month if the original date is a last day of months. For example, `select add_months(DATE'2019-02-28', 1)` results `2019-03-28`. In Spark version 2.4 and earlier, the resulting date is adjusted when the original date is a last day of months. For example, adding a month to `2019-02-28` results in `2019-03-31`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index 7e977241bc45c..5dbdedf27b859 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -31,13 +31,8 @@ object CTESubstitution extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { val isLegacy = SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_ENABLED) if (isLegacy.isEmpty) { - if (detectConflictInNestedCTE(plan, inTraverse = false)) { - throw new AnalysisException("Name collision in nested CTE was founded in current plan, " + - s"please select the CTE precedence via ${LEGACY_CTE_PRECEDENCE_ENABLED.key}, " + - "see more detail in SPARK-28228.") - } else { - traverseAndSubstituteCTE(plan, inTraverse = false) - } + assertNoNameConflictsInCTE(plan, inTraverse = false) + traverseAndSubstituteCTE(plan, inTraverse = false) } else if (isLegacy.get) { legacyTraverseAndSubstituteCTE(plan) } else { @@ -49,33 +44,35 @@ object CTESubstitution extends Rule[LogicalPlan] { * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through * child, innerChildren and subquery for the current plan. */ - private def detectConflictInNestedCTE( + private def assertNoNameConflictsInCTE( plan: LogicalPlan, inTraverse: Boolean, - cteNames: Set[String] = Set.empty): Boolean = { + cteNames: Set[String] = Set.empty): Unit = { plan.foreach { case w @ With(child, relations) => val newNames = relations.map { case (cteName, _) => if (cteNames.contains(cteName)) { - return true + throw new AnalysisException(s"Name $cteName is conflict in nested CTE. " + + s"Please set ${LEGACY_CTE_PRECEDENCE_ENABLED.key} to false so that name defined " + + "in inner CTE takes precedence. See more details in SPARK-28228.") } else { cteName } }.toSet (w.innerChildren :+ child).foreach { p => - if (detectConflictInNestedCTE(p, inTraverse = true, cteNames ++ newNames)) return true + assertNoNameConflictsInCTE(p, inTraverse = true, cteNames ++ newNames) } case other if inTraverse => other.transformExpressions { - case e: SubqueryExpression - if detectConflictInNestedCTE(e.plan, inTraverse = true, cteNames) => return true + case e: SubqueryExpression => + assertNoNameConflictsInCTE(e.plan, inTraverse = true, cteNames) + e } case _ => } - false } private def legacyTraverseAndSubstituteCTE(plan: LogicalPlan): LogicalPlan = { diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql index 796c4b768e9da..b711bf338ab08 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql @@ -1,115 +1,2 @@ -create temporary view t as select * from values 0, 1, 2 as t(id); -create temporary view t2 as select * from values 0, 1 as t(id); - --- CTE non-legacy substitution -SET spark.sql.legacy.ctePrecedence.enabled=false; - --- CTE in CTE definition -WITH t as ( - WITH t2 AS (SELECT 1) - SELECT * FROM t2 -) -SELECT * FROM t; - --- CTE in subquery -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 1) - SELECT * FROM t -); - --- CTE in subquery expression -SELECT ( - WITH t AS (SELECT 1) - SELECT * FROM t -); - --- CTE in CTE definition shadows outer -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -SELECT * FROM t2; - --- CTE in CTE definition shadows outer 2 -WITH - t(c) AS (SELECT 1), - t2 AS ( - SELECT ( - SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) - ) - ) -SELECT * FROM t2; - --- CTE in CTE definition shadows outer 3 -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2), - t2 AS ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) - SELECT * FROM t2 - ) -SELECT * FROM t2; - --- CTE in subquery shadows outer -WITH t(c) AS (SELECT 1) -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t -); - --- CTE in subquery shadows outer 2 -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) -); - --- CTE in subquery shadows outer 3 -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 3) - SELECT * FROM t - ) -); - --- CTE in subquery expression shadows outer -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t -); - --- CTE in subquery expression shadows outer 2 -WITH t AS (SELECT 1) -SELECT ( - SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -); - --- CTE in subquery expression shadows outer 3 -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) -); - --- Clean up -DROP VIEW IF EXISTS t; -DROP VIEW IF EXISTS t2; +--SET spark.sql.legacy.ctePrecedence.enabled = false +--IMPORT cte.sql diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out index f44ac8176c599..2d87781193c25 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 17 +-- Number of queries: 27 -- !query @@ -19,11 +19,146 @@ struct<> -- !query -SET spark.sql.legacy.ctePrecedence.enabled=false +WITH s AS (SELECT 1 FROM s) SELECT * FROM s -- !query schema -struct +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: s; line 1 pos 25 + + +-- !query +WITH r AS (SELECT (SELECT * FROM r)) +SELECT * FROM r +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: r; line 1 pos 33 + + +-- !query +WITH t AS (SELECT 1 FROM t) SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 +1 +1 + + +-- !query +WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: s2; line 1 pos 26 + + +-- !query +WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2 +-- !query schema +struct +-- !query output +0 2 +0 2 +1 2 +1 2 + + +-- !query +WITH CTE1 AS ( + SELECT b.id AS id + FROM T2 a + CROSS JOIN (SELECT id AS id FROM T2) b +) +SELECT t1.id AS c1, + t2.id AS c2 +FROM CTE1 t1 + CROSS JOIN CTE1 t2 +-- !query schema +struct +-- !query output +0 0 +0 0 +0 0 +0 0 +0 1 +0 1 +0 1 +0 1 +1 0 +1 0 +1 0 +1 0 +1 1 +1 1 +1 1 +1 1 + + +-- !query +WITH t(x) AS (SELECT 1) +SELECT * FROM t WHERE x = 1 +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH t(x, y) AS (SELECT 1, 2) +SELECT * FROM t WHERE x = 1 AND y = 2 +-- !query schema +struct +-- !query output +1 2 + + +-- !query +WITH t(x, x) AS (SELECT 1, 2) +SELECT * FROM t +-- !query schema +struct +-- !query output +1 2 + + +-- !query +WITH t() AS (SELECT 1) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'WITH t()'(line 1, pos 7) + +== SQL == +WITH t() AS (SELECT 1) +-------^^^ +SELECT * FROM t + + +-- !query +WITH + t(x) AS (SELECT 1), + t(x) AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<> -- !query output -spark.sql.legacy.ctePrecedence.enabled false +org.apache.spark.sql.catalyst.parser.ParseException + +CTE definition can't have duplicate names: 't'.(line 1, pos 0) + +== SQL == +WITH +^^^ + t(x) AS (SELECT 1), + t(x) AS (SELECT 2) +SELECT * FROM t -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index 2ef4ef1c8399c..fb916a99b3e4a 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -207,7 +207,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -226,7 +226,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -245,7 +245,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -258,7 +258,7 @@ SELECT max(c) FROM ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -273,7 +273,7 @@ SELECT sum(c) FROM ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -289,7 +289,7 @@ SELECT sum(c) FROM ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -302,7 +302,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -317,7 +317,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -333,7 +333,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name collision in nested CTE was founded in current plan, please select the CTE precedence via spark.sql.legacy.ctePrecedence.enabled, see more detail in SPARK-28228.; +Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query From 72494043d5a431dbe024e7472b4df0e5d3b96352 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Fri, 7 Feb 2020 22:41:06 +0800 Subject: [PATCH 5/6] address comments --- docs/sql-migration-guide.md | 2 +- .../catalyst/analysis/CTESubstitution.scala | 9 +++++-- .../apache/spark/sql/internal/SQLConf.scala | 4 ++- .../resources/sql-tests/results/cte.sql.out | 27 +++++++++---------- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 84374650a6982..d69b3d68583f7 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -103,7 +103,7 @@ license: | - Since Spark 3.0, if files or subdirectories disappear during recursive directory listing (i.e. they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless `spark.sql.files.ignoreMissingFiles` is `true` (default `false`). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during `REFRESH TABLE`), not during query execution: the net change is that `spark.sql.files.ignoreMissingFiles` is now obeyed during table file listing / query planning, not only at query execution time. - - Since Spark 3.0, Spark throws an AnalysisException if name conflict is detected in the nested WITH clause. It forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false (which is recommended), inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. + - Since Spark 3.0, Spark throws an AnalysisException if name conflict is detected in the nested WITH clause by default. It forces the users to choose the specific substitution order they wanted, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to false (which is recommended), inner CTE definitions take precedence over outer definitions. For example, set the config to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `2`, while setting it to `true`, the result is `1` which is the behavior in version 2.4 and earlier. - Since Spark 3.0, the `add_months` function does not adjust the resulting date to a last day of month if the original date is a last day of months. For example, `select add_months(DATE'2019-02-28', 1)` results `2019-03-28`. In Spark version 2.4 and earlier, the resulting date is adjusted when the original date is a last day of months. For example, adding a month to `2019-02-28` results in `2019-03-31`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index 5dbdedf27b859..d2be15d87d023 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -53,14 +53,19 @@ object CTESubstitution extends Rule[LogicalPlan] { val newNames = relations.map { case (cteName, _) => if (cteNames.contains(cteName)) { - throw new AnalysisException(s"Name $cteName is conflict in nested CTE. " + + throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " + s"Please set ${LEGACY_CTE_PRECEDENCE_ENABLED.key} to false so that name defined " + "in inner CTE takes precedence. See more details in SPARK-28228.") } else { cteName } }.toSet - (w.innerChildren :+ child).foreach { p => + child.transformExpressions { + case e: SubqueryExpression => + assertNoNameConflictsInCTE(e.plan, inTraverse = true, cteNames ++ newNames) + e + } + w.innerChildren.foreach { p => assertNoNameConflictsInCTE(p, inTraverse = true, cteNames ++ newNames) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1e01d4cf6a042..dcae556d722e0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2077,7 +2077,9 @@ object SQLConf { val LEGACY_CTE_PRECEDENCE_ENABLED = buildConf("spark.sql.legacy.ctePrecedence.enabled") .internal() - .doc("When true, outer CTE definitions takes precedence over inner definitions.") + .doc("When true, outer CTE definitions takes precedence over inner definitions. If set to " + + "false, inner CTE definitions take precedence. The default value is empty, which keep the " + + "behavior of throwing an AnalysisException while name conflict is detected in nested CTE.") .booleanConf .createOptional diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index fb916a99b3e4a..1d50aa8f57505 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -207,7 +207,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -226,7 +226,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -245,7 +245,7 @@ SELECT * FROM t2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -255,10 +255,9 @@ SELECT max(c) FROM ( SELECT * FROM t ) -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +2 -- !query @@ -270,10 +269,9 @@ SELECT sum(c) FROM ( ) ) -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +2 -- !query @@ -286,10 +284,9 @@ SELECT sum(c) FROM ( ) ) -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +3 -- !query @@ -302,7 +299,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -317,7 +314,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -333,7 +330,7 @@ SELECT ( struct<> -- !query output org.apache.spark.sql.AnalysisException -Name t is conflict in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query From ebd337b9250b9a1426232fe16d5eda92e16b23c8 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Sat, 8 Feb 2020 09:42:27 +0800 Subject: [PATCH 6/6] rephrase --- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index dcae556d722e0..67457f823eb60 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2078,8 +2078,8 @@ object SQLConf { val LEGACY_CTE_PRECEDENCE_ENABLED = buildConf("spark.sql.legacy.ctePrecedence.enabled") .internal() .doc("When true, outer CTE definitions takes precedence over inner definitions. If set to " + - "false, inner CTE definitions take precedence. The default value is empty, which keep the " + - "behavior of throwing an AnalysisException while name conflict is detected in nested CTE.") + "false, inner CTE definitions take precedence. The default value is empty, " + + "AnalysisException is thrown while name conflict is detected in nested CTE.") .booleanConf .createOptional