diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0900f8317d635..a1e019cbec4d2 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1876,6 +1876,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see ## Upgrading From Spark SQL 2.3 to 2.4 + - Since Spark 2.4, Spark will evaluate the set operations referenced in a query by following a precedence rule as per the SQL standard. If the order is not specified by parentheses, set operations are performed from left to right with the exception that all INTERSECT operations are performed before any UNION, EXCEPT or MINUS operations. The old behaviour of giving equal precedence to all the set operations are preserved under a newly added configuaration `spark.sql.legacy.setopsPrecedence.enabled` with a default value of `false`. When this property is set to `true`, spark will evaluate the set operators from left to right as they appear in the query given no explicit ordering is enforced by usage of parenthesis. - Since Spark 2.4, Spark will display table description column Last Access value as UNKNOWN when the value was Jan 01 1970. - Since Spark 2.4, Spark maximizes the usage of a vectorized ORC reader for ORC files by default. To do that, `spark.sql.orc.impl` and `spark.sql.orc.filterPushdown` change their default values to `native` and `true` respectively. - In PySpark, when Arrow optimization is enabled, previously `toPandas` just failed when Arrow optimization is unable to be used whereas `createDataFrame` from Pandas DataFrame allowed the fallback to non-optimization. Now, both `toPandas` and `createDataFrame` from Pandas DataFrame allow the fallback by default, which can be switched off by `spark.sql.execution.arrow.fallback.enabled`. diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 2aca10f1bfbc7..884da9ee45667 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -17,6 +17,12 @@ grammar SqlBase; @members { + /** + * When false, INTERSECT is given the greater precedence over the other set + * operations (UNION, EXCEPT and MINUS) as per the SQL standard. + */ + public boolean legacy_setops_precedence_enbled = false; + /** * Verify whether current token is a valid decimal token (which contains dot). * Returns true if the character that follows the token is not a digit or letter or underscore. @@ -352,8 +358,13 @@ multiInsertQueryBody ; queryTerm - : queryPrimary #queryTermDefault - | left=queryTerm operator=(INTERSECT | UNION | EXCEPT | SETMINUS) setQuantifier? right=queryTerm #setOperation + : queryPrimary #queryTermDefault + | left=queryTerm {legacy_setops_precedence_enbled}? + operator=(INTERSECT | UNION | EXCEPT | SETMINUS) setQuantifier? right=queryTerm #setOperation + | left=queryTerm {!legacy_setops_precedence_enbled}? + operator=INTERSECT setQuantifier? right=queryTerm #setOperation + | left=queryTerm {!legacy_setops_precedence_enbled}? + operator=(UNION | EXCEPT | SETMINUS) setQuantifier? right=queryTerm #setOperation ; queryPrimary diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 98708545c4bfc..7997e79003b12 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -356,9 +356,11 @@ package object dsl { def subquery(alias: Symbol): LogicalPlan = SubqueryAlias(alias.name, logicalPlan) - def except(otherPlan: LogicalPlan): LogicalPlan = Except(logicalPlan, otherPlan) + def except(otherPlan: LogicalPlan, isAll: Boolean = false): LogicalPlan = + Except(logicalPlan, otherPlan, isAll) - def intersect(otherPlan: LogicalPlan): LogicalPlan = Intersect(logicalPlan, otherPlan) + def intersect(otherPlan: LogicalPlan, isAll: Boolean = false): LogicalPlan = + Intersect(logicalPlan, otherPlan, isAll) def union(otherPlan: LogicalPlan): LogicalPlan = Union(logicalPlan, otherPlan) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index 4c20f2368bded..7d8cb1f18b4b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -84,12 +84,14 @@ abstract class AbstractSqlParser extends ParserInterface with Logging { val lexer = new SqlBaseLexer(new UpperCaseCharStream(CharStreams.fromString(command))) lexer.removeErrorListeners() lexer.addErrorListener(ParseErrorListener) + lexer.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced val tokenStream = new CommonTokenStream(lexer) val parser = new SqlBaseParser(tokenStream) parser.addParseListener(PostProcessor) parser.removeErrorListeners() parser.addErrorListener(ParseErrorListener) + parser.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced try { try { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 68413d7fd10f1..d7dbdb39a9afb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -165,9 +165,9 @@ object SetOperation { } case class Intersect( - left: LogicalPlan, - right: LogicalPlan, - isAll: Boolean = false) extends SetOperation(left, right) { + left: LogicalPlan, + right: LogicalPlan, + isAll: Boolean = false) extends SetOperation(left, right) { override def nodeName: String = getClass.getSimpleName + ( if ( isAll ) "All" else "" ) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index edc1a488150c2..596d826b15f54 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1451,6 +1451,16 @@ object SQLConf { .intConf .checkValues((1 to 9).toSet + Deflater.DEFAULT_COMPRESSION) .createWithDefault(Deflater.DEFAULT_COMPRESSION) + + val LEGACY_SETOPS_PRECEDENCE_ENABLED = + buildConf("spark.sql.legacy.setopsPrecedence.enabled") + .internal() + .doc("When set to true and the order of evaluation is not specified by parentheses, the " + + "set operations are performed from left to right as they appear in the query. When set " + + "to false and order of evaluation is not specified by parentheses, INTERSECT operations " + + "are performed before any UNION, EXCEPT and MINUS operations.") + .booleanConf + .createWithDefault(false) } /** @@ -1841,6 +1851,8 @@ class SQLConf extends Serializable with Logging { def avroDeflateLevel: Int = getConf(SQLConf.AVRO_DEFLATE_LEVEL) + def setOpsPrecedenceEnforced: Boolean = getConf(SQLConf.LEGACY_SETOPS_PRECEDENCE_ENABLED) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 9be0ec5af78ef..38efd89156de6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType /** @@ -676,4 +677,48 @@ class PlanParserSuite extends AnalysisTest { OneRowRelation().select('rtrim.function("c&^,.", "bc...,,,&&&ccc")) ) } + + test("precedence of set operations") { + val a = table("a").select(star()) + val b = table("b").select(star()) + val c = table("c").select(star()) + val d = table("d").select(star()) + + val query1 = + """ + |SELECT * FROM a + |UNION + |SELECT * FROM b + |EXCEPT + |SELECT * FROM c + |INTERSECT + |SELECT * FROM d + """.stripMargin + + val query2 = + """ + |SELECT * FROM a + |UNION + |SELECT * FROM b + |EXCEPT ALL + |SELECT * FROM c + |INTERSECT ALL + |SELECT * FROM d + """.stripMargin + + assertEqual(query1, Distinct(a.union(b)).except(c.intersect(d))) + assertEqual(query2, Distinct(a.union(b)).except(c.intersect(d, isAll = true), isAll = true)) + + // Now disable precedence enforcement to verify the old behaviour. + withSQLConf(SQLConf.LEGACY_SETOPS_PRECEDENCE_ENABLED.key -> "true") { + assertEqual(query1, Distinct(a.union(b)).except(c).intersect(d)) + assertEqual(query2, Distinct(a.union(b)).except(c, isAll = true).intersect(d, isAll = true)) + } + + // Explicitly enable the precedence enforcement + withSQLConf(SQLConf.LEGACY_SETOPS_PRECEDENCE_ENABLED.key -> "false") { + assertEqual(query1, Distinct(a.union(b)).except(c.intersect(d))) + assertEqual(query2, Distinct(a.union(b)).except(c.intersect(d, isAll = true), isAll = true)) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 75eff8a88312b..b4179f4d12d35 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -535,14 +535,14 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.Intersect(left, right, true) => throw new IllegalStateException( "logical intersect operator should have been replaced by union, aggregate" + - "and generate operators in the optimizer") + " and generate operators in the optimizer") case logical.Except(left, right, false) => throw new IllegalStateException( "logical except operator should have been replaced by anti-join in the optimizer") case logical.Except(left, right, true) => throw new IllegalStateException( "logical except (all) operator should have been replaced by union, aggregate" + - "and generate operators in the optimizer") + " and generate operators in the optimizer") case logical.DeserializeToObject(deserializer, objAttr, child) => execution.DeserializeToObjectExec(deserializer, objAttr, planLater(child)) :: Nil diff --git a/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql b/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql index ff4395c3e7447..b0b2244048caa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql @@ -59,29 +59,40 @@ INTERSECT ALL SELECT * FROM tab2; -- Chain of different `set operations --- We need to parenthesize the following two queries to enforce --- certain order of evaluation of operators. After fix to --- SPARK-24966 this can be removed. SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 UNION ALL -( SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 -); +; -- Chain of different `set operations SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 EXCEPT -( SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 -); +; + +-- test use parenthesis to control order of evaluation +( + ( + ( + SELECT * FROM tab1 + EXCEPT + SELECT * FROM tab2 + ) + EXCEPT + SELECT * FROM tab1 + ) + INTERSECT ALL + SELECT * FROM tab2 +) +; -- Join under intersect all SELECT * @@ -118,6 +129,32 @@ SELECT v FROM tab1 GROUP BY v INTERSECT ALL SELECT k FROM tab2 GROUP BY k; +-- Test pre spark2.4 behaviour of set operation precedence +-- All the set operators are given equal precedence and are evaluated +-- from left to right as they appear in the query. + +-- Set the property +SET spark.sql.legacy.setopsPrecedence.enabled= true; + +SELECT * FROM tab1 +EXCEPT +SELECT * FROM tab2 +UNION ALL +SELECT * FROM tab1 +INTERSECT ALL +SELECT * FROM tab2; + +SELECT * FROM tab1 +EXCEPT +SELECT * FROM tab2 +UNION ALL +SELECT * FROM tab1 +INTERSECT +SELECT * FROM tab2; + +-- Restore the property +SET spark.sql.legacy.setopsPrecedence.enabled = false; + -- Clean-up DROP VIEW IF EXISTS tab1; DROP VIEW IF EXISTS tab2; diff --git a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out index 792791bc51628..63dd56ce468bc 100644 --- a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 17 +-- Number of queries: 22 -- !query 0 @@ -133,11 +133,9 @@ SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 UNION ALL -( SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 -) -- !query 10 schema struct -- !query 10 output @@ -154,11 +152,9 @@ SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 EXCEPT -( SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 -) -- !query 11 schema struct -- !query 11 output @@ -166,6 +162,26 @@ struct -- !query 12 +( + ( + ( + SELECT * FROM tab1 + EXCEPT + SELECT * FROM tab2 + ) + EXCEPT + SELECT * FROM tab1 + ) + INTERSECT ALL + SELECT * FROM tab2 +) +-- !query 12 schema +struct +-- !query 12 output + + + +-- !query 13 SELECT * FROM (SELECT tab1.k, tab2.v @@ -179,9 +195,9 @@ FROM (SELECT tab1.k, FROM tab1 JOIN tab2 ON tab1.k = tab2.k) --- !query 12 schema +-- !query 13 schema struct --- !query 12 output +-- !query 13 output 1 2 1 2 1 2 @@ -193,7 +209,7 @@ struct 2 3 --- !query 13 +-- !query 14 SELECT * FROM (SELECT tab1.k, tab2.v @@ -207,35 +223,85 @@ FROM (SELECT tab2.v AS k, FROM tab1 JOIN tab2 ON tab1.k = tab2.k) --- !query 13 schema +-- !query 14 schema struct --- !query 13 output +-- !query 14 output --- !query 14 +-- !query 15 SELECT v FROM tab1 GROUP BY v INTERSECT ALL SELECT k FROM tab2 GROUP BY k --- !query 14 schema +-- !query 15 schema struct --- !query 14 output +-- !query 15 output 2 3 NULL --- !query 15 +-- !query 16 +SET spark.sql.legacy.setopsPrecedence.enabled= true +-- !query 16 schema +struct +-- !query 16 output +spark.sql.legacy.setopsPrecedence.enabled true + + +-- !query 17 +SELECT * FROM tab1 +EXCEPT +SELECT * FROM tab2 +UNION ALL +SELECT * FROM tab1 +INTERSECT ALL +SELECT * FROM tab2 +-- !query 17 schema +struct +-- !query 17 output +1 2 +1 2 +2 3 +NULL NULL +NULL NULL + + +-- !query 18 +SELECT * FROM tab1 +EXCEPT +SELECT * FROM tab2 +UNION ALL +SELECT * FROM tab1 +INTERSECT +SELECT * FROM tab2 +-- !query 18 schema +struct +-- !query 18 output +1 2 +2 3 +NULL NULL + + +-- !query 19 +SET spark.sql.legacy.setopsPrecedence.enabled = false +-- !query 19 schema +struct +-- !query 19 output +spark.sql.legacy.setopsPrecedence.enabled false + + +-- !query 20 DROP VIEW IF EXISTS tab1 --- !query 15 schema +-- !query 20 schema struct<> --- !query 15 output +-- !query 20 output --- !query 16 +-- !query 21 DROP VIEW IF EXISTS tab2 --- !query 16 schema +-- !query 21 schema struct<> --- !query 16 output +-- !query 21 output