-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-17008][SPARK-17009][SQL] Normalization and isolation in SQLQueryTestSuite. #14590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| -- date time functions | ||
|
|
||
| -- [SPARK-16836] current_date and current_timestamp literals | ||
| select current_date = current_date(), current_timestamp = current_timestamp(); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| create temporary view hav as select * from values | ||
| ("one", 1), | ||
| ("two", 2), | ||
| ("three", 3), | ||
| ("one", 5) | ||
| as hav(k, v); | ||
|
|
||
| -- having clause | ||
| SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2; | ||
|
|
||
| -- having condition contains grouping column | ||
| SELECT count(k) FROM hav GROUP BY v + 1 HAVING v + 1 = 2; | ||
|
|
||
| -- SPARK-11032: resolve having correctly | ||
| SELECT MIN(t.v) FROM (SELECT * FROM hav WHERE v > 0) t HAVING(COUNT(1) > 0); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| create temporary view nt1 as select * from values | ||
| ("one", 1), | ||
| ("two", 2), | ||
| ("three", 3) | ||
| as nt1(k, v1); | ||
|
|
||
| create temporary view nt2 as select * from values | ||
| ("one", 1), | ||
| ("two", 22), | ||
| ("one", 5) | ||
| as nt2(k, v2); | ||
|
|
||
|
|
||
| SELECT * FROM nt1 natural join nt2 where k = "one"; | ||
|
|
||
| SELECT * FROM nt1 natural left join nt2 order by v1, v2; | ||
|
|
||
| SELECT * FROM nt1 natural right join nt2 order by v1, v2; | ||
|
|
||
| SELECT count(*) FROM nt1 natural full outer join nt2; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| -- Automatically generated by org.apache.spark.sql.SQLQueryTestSuite | ||
| -- Number of queries: 1 | ||
|
|
||
|
|
||
| -- !query 0 | ||
| select current_date = current_date(), current_timestamp = current_timestamp() | ||
| -- !query 0 schema | ||
| struct<(current_date() = current_date()):boolean,(current_timestamp() = current_timestamp()):boolean> | ||
| -- !query 0 output | ||
| true true | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| -- Automatically generated by org.apache.spark.sql.SQLQueryTestSuite | ||
| -- Number of queries: 4 | ||
|
|
||
|
|
||
| -- !query 0 | ||
| create temporary view hav as select * from values | ||
| ("one", 1), | ||
| ("two", 2), | ||
| ("three", 3), | ||
| ("one", 5) | ||
| as hav(k, v) | ||
| -- !query 0 schema | ||
| struct<> | ||
| -- !query 0 output | ||
|
|
||
|
|
||
|
|
||
| -- !query 1 | ||
| SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 | ||
| -- !query 1 schema | ||
| struct<k:string,sum(v):bigint> | ||
| -- !query 1 output | ||
| one 6 | ||
| three 3 | ||
|
|
||
|
|
||
| -- !query 2 | ||
| SELECT count(k) FROM hav GROUP BY v + 1 HAVING v + 1 = 2 | ||
| -- !query 2 schema | ||
| struct<count(k):bigint> | ||
| -- !query 2 output | ||
| 1 | ||
|
|
||
|
|
||
| -- !query 3 | ||
| SELECT MIN(t.v) FROM (SELECT * FROM hav WHERE v > 0) t HAVING(COUNT(1) > 0) | ||
| -- !query 3 schema | ||
| struct<min(v):int> | ||
| -- !query 3 output | ||
| 1 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| -- Automatically generated by org.apache.spark.sql.SQLQueryTestSuite | ||
| -- Number of queries: 6 | ||
|
|
||
|
|
||
| -- !query 0 | ||
| create temporary view nt1 as select * from values | ||
| ("one", 1), | ||
| ("two", 2), | ||
| ("three", 3) | ||
| as nt1(k, v1) | ||
| -- !query 0 schema | ||
| struct<> | ||
| -- !query 0 output | ||
|
|
||
|
|
||
|
|
||
| -- !query 1 | ||
| create temporary view nt2 as select * from values | ||
| ("one", 1), | ||
| ("two", 22), | ||
| ("one", 5) | ||
| as nt2(k, v2) | ||
| -- !query 1 schema | ||
| struct<> | ||
| -- !query 1 output | ||
|
|
||
|
|
||
|
|
||
| -- !query 2 | ||
| SELECT * FROM nt1 natural join nt2 where k = "one" | ||
| -- !query 2 schema | ||
| struct<k:string,v1:int,v2:int> | ||
| -- !query 2 output | ||
| one 1 1 | ||
| one 1 5 | ||
|
|
||
|
|
||
| -- !query 3 | ||
| SELECT * FROM nt1 natural left join nt2 order by v1, v2 | ||
| -- !query 3 schema | ||
| struct<k:string,v1:int,v2:int> | ||
| -- !query 3 output | ||
| one 1 1 | ||
| one 1 5 | ||
| two 2 22 | ||
| three 3 NULL | ||
|
|
||
|
|
||
| -- !query 4 | ||
| SELECT * FROM nt1 natural right join nt2 order by v1, v2 | ||
| -- !query 4 schema | ||
| struct<k:string,v1:int,v2:int> | ||
| -- !query 4 output | ||
| one 1 1 | ||
| one 1 5 | ||
| two 2 22 | ||
|
|
||
|
|
||
| -- !query 5 | ||
| SELECT count(*) FROM nt1 natural full outer join nt2 | ||
| -- !query 5 schema | ||
| struct<count(1):bigint> | ||
| -- !query 5 output | ||
| 4 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,9 +20,12 @@ package org.apache.spark.sql | |
| import java.io.File | ||
| import java.util.{Locale, TimeZone} | ||
|
|
||
| import org.apache.spark.sql.catalyst.planning.PhysicalOperation | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules.RuleExecutor | ||
| import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile} | ||
| import org.apache.spark.sql.test.SharedSQLContext | ||
| import org.apache.spark.sql.types.StructType | ||
|
|
||
| /** | ||
| * End-to-end test cases for SQL queries. | ||
|
|
@@ -126,14 +129,18 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { | |
| cleaned.split("(?<=[^\\\\]);").map(_.trim).filter(_ != "").toSeq | ||
| } | ||
|
|
||
| // Create a local SparkSession to have stronger isolation between different test cases. | ||
| // This does not isolate catalog changes. | ||
| val localSparkSession = spark.newSession() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it expensive? I remember that other tests share one Spark session for performance reasons.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. SparkSession should be fine. SparkContext is the expensive one. |
||
|
|
||
| // Run the SQL queries preparing them for comparison. | ||
| val outputs: Seq[QueryOutput] = queries.map { sql => | ||
| val df = spark.sql(sql) | ||
| val (schema, output) = getNormalizedResult(localSparkSession, sql) | ||
| // We might need to do some query canonicalization in the future. | ||
| QueryOutput( | ||
| sql = sql, | ||
| schema = df.schema.catalogString, | ||
| output = df.queryExecution.hiveResultString().mkString("\n")) | ||
| schema = schema.catalogString, | ||
| output = output.mkString("\n")) | ||
| } | ||
|
|
||
| if (regenerateGoldenFiles) { | ||
|
|
@@ -176,6 +183,23 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { | |
| } | ||
| } | ||
|
|
||
| /** Executes a query and returns the result as (schema of the output, normalized output). */ | ||
| private def getNormalizedResult(session: SparkSession, sql: String): (StructType, Seq[String]) = { | ||
| // Returns true if the plan is supposed to be sorted. | ||
| def isSorted(plan: LogicalPlan): Boolean = plan match { | ||
| case _: Join | _: Aggregate | _: Generate | _: Sample | _: Distinct => false | ||
| case PhysicalOperation(_, _, Sort(_, true, _)) => true | ||
| case _ => plan.children.iterator.exists(isSorted) | ||
| } | ||
|
|
||
| val df = session.sql(sql) | ||
| val schema = df.schema | ||
| val answer = df.queryExecution.hiveResultString() | ||
|
|
||
| // If the output is not pre-sorted, sort it. | ||
| if (isSorted(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted) | ||
| } | ||
|
|
||
| private def listTestCases(): Seq[TestCase] = { | ||
| listFilesRecursively(new File(inputFilePath)).map { file => | ||
| val resultFile = file.getAbsolutePath.replace(inputFilePath, goldenFilePath) + ".out" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be better to remove the package name from the "Automatically generated by org.apache.spark.sql.SQLQueryTestSuite" header so we don't need to change all the generated files when we move this class.