-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-16633] [SPARK-16642] [SPARK-16721] [SQL] Fixes three issues related to lead and lag functions #14284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-16633] [SPARK-16642] [SPARK-16721] [SQL] Fixes three issues related to lead and lag functions #14284
Changes from all commits
78e6901
da5f36f
02ee191
506393b
c2fd2d7
51e6937
faa7d89
073ac94
ff3029e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -582,25 +582,43 @@ private[execution] final class OffsetWindowFunctionFrame( | |
| /** Row used to combine the offset and the current row. */ | ||
| private[this] val join = new JoinedRow | ||
|
|
||
| /** Create the projection. */ | ||
| /** | ||
| * Create the projection used when the offset row exists. | ||
| * Please note that this projection always respects null input values (like PostgreSQL). | ||
| */ | ||
| private[this] val projection = { | ||
| // Collect the expressions and bind them. | ||
| val inputAttrs = inputSchema.map(_.withNullability(true)) | ||
| val numInputAttributes = inputAttrs.size | ||
| val boundExpressions = Seq.fill(ordinal)(NoOp) ++ expressions.toSeq.map { | ||
| case e: OffsetWindowFunction => | ||
| val input = BindReferences.bindReference(e.input, inputAttrs) | ||
| input | ||
| case e => | ||
| BindReferences.bindReference(e, inputAttrs) | ||
| } | ||
|
|
||
| // Create the projection. | ||
| newMutableProjection(boundExpressions, Nil).target(target) | ||
| } | ||
|
|
||
| /** Create the projection used when the offset row DOES NOT exist. */ | ||
| private[this] val fillDefaultValue = { | ||
| // Collect the expressions and bind them. | ||
| val inputAttrs = inputSchema.map(_.withNullability(true)) | ||
| val numInputAttributes = inputAttrs.size | ||
| val boundExpressions = Seq.fill(ordinal)(NoOp) ++ expressions.toSeq.map { | ||
| case e: OffsetWindowFunction => | ||
| if (e.default == null || e.default.foldable && e.default.eval() == null) { | ||
| // Without default value. | ||
| input | ||
| // The default value is null. | ||
| Literal.create(null, e.dataType) | ||
| } else { | ||
| // With default value. | ||
| // The default value is an expression. | ||
| val default = BindReferences.bindReference(e.default, inputAttrs).transform { | ||
| // Shift the input reference to its default version. | ||
| case BoundReference(o, dataType, nullable) => | ||
| BoundReference(o + numInputAttributes, dataType, nullable) | ||
| } | ||
| org.apache.spark.sql.catalyst.expressions.Coalesce(input :: default :: Nil) | ||
| default | ||
| } | ||
| case e => | ||
| BindReferences.bindReference(e, inputAttrs) | ||
|
|
@@ -625,10 +643,12 @@ private[execution] final class OffsetWindowFunctionFrame( | |
| if (inputIndex >= 0 && inputIndex < input.size) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a more general comment, which does not necessarily apply to this line. Since we are breaking the code up into two separate code paths (with row/without row), we might as well get rid of the joined row and the logic needed to set this up (like:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, we can improve this part in master.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, let's improve this in a follow-up PR :). |
||
| val r = input.next() | ||
| join(r, current) | ||
| projection(join) | ||
| } else { | ||
| join(emptyRow, current) | ||
| // Use default values since the offset row does not exist. | ||
| fillDefaultValue(join) | ||
| } | ||
| projection(join) | ||
| inputIndex += 1 | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,21 +15,20 @@ | |
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.hive.execution | ||
| package org.apache.spark.sql.execution | ||
|
|
||
| import org.apache.spark.sql.{AnalysisException, QueryTest, Row} | ||
| import org.apache.spark.sql.hive.test.TestHiveSingleton | ||
| import org.apache.spark.sql.test.SQLTestUtils | ||
|
|
||
| import org.apache.spark.sql.test.SharedSQLContext | ||
|
|
||
| case class WindowData(month: Int, area: String, product: Int) | ||
|
|
||
|
|
||
| /** | ||
| * Test suite for SQL window functions. | ||
| */ | ||
| class SQLWindowFunctionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { | ||
| import spark.implicits._ | ||
| class SQLWindowFunctionSuite extends QueryTest with SharedSQLContext { | ||
|
|
||
| import testImplicits._ | ||
|
|
||
| test("window function: udaf with aggregate expression") { | ||
| val data = Seq( | ||
|
|
@@ -357,14 +356,59 @@ class SQLWindowFunctionSuite extends QueryTest with SQLTestUtils with TestHiveSi | |
| } | ||
|
|
||
| test("SPARK-7595: Window will cause resolve failed with self join") { | ||
| sql("SELECT * FROM src") // Force loading of src table. | ||
|
|
||
| checkAnswer(sql( | ||
| """ | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this test, I disabled the fix (https://github.com/apache/spark/pull/6114/files) and checked that it does fail the analysis because analyzer fails to resolve conflicting references in Join. So, this test is still valid after my change.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But why do we remove it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is not in Hive. So there is no table called |
||
| |with | ||
| | v1 as (select key, count(value) over (partition by key) cnt_val from src), | ||
| | v0 as (select 0 as key, 1 as value), | ||
| | v1 as (select key, count(value) over (partition by key) cnt_val from v0), | ||
| | v2 as (select v1.key, v1_lag.cnt_val from v1, v1 v1_lag where v1.key = v1_lag.key) | ||
| | select * from v2 order by key limit 1 | ||
| """.stripMargin), Row(0, 3)) | ||
| | select key, cnt_val from v2 order by key limit 1 | ||
| """.stripMargin), Row(0, 1)) | ||
| } | ||
|
|
||
| test("SPARK-16633: lead/lag should return the default value if the offset row does not exist") { | ||
| checkAnswer(sql( | ||
| """ | ||
| |SELECT | ||
| | lag(123, 100, 321) OVER (ORDER BY id) as lag, | ||
| | lead(123, 100, 321) OVER (ORDER BY id) as lead | ||
| |FROM (SELECT 1 as id) tmp | ||
| """.stripMargin), | ||
| Row(321, 321)) | ||
|
|
||
| checkAnswer(sql( | ||
| """ | ||
| |SELECT | ||
| | lag(123, 100, a) OVER (ORDER BY id) as lag, | ||
| | lead(123, 100, a) OVER (ORDER BY id) as lead | ||
| |FROM (SELECT 1 as id, 2 as a) tmp | ||
| """.stripMargin), | ||
| Row(2, 2)) | ||
| } | ||
|
|
||
| test("lead/lag should respect null values") { | ||
| checkAnswer(sql( | ||
| """ | ||
| |SELECT | ||
| | b, | ||
| | lag(a, 1, 321) OVER (ORDER BY b) as lag, | ||
| | lead(a, 1, 321) OVER (ORDER BY b) as lead | ||
| |FROM (SELECT cast(null as int) as a, 1 as b | ||
| | UNION ALL | ||
| | select cast(null as int) as id, 2 as b) tmp | ||
| """.stripMargin), | ||
| Row(1, 321, null) :: Row(2, null, 321) :: Nil) | ||
|
|
||
| checkAnswer(sql( | ||
| """ | ||
| |SELECT | ||
| | b, | ||
| | lag(a, 1, c) OVER (ORDER BY b) as lag, | ||
| | lead(a, 1, c) OVER (ORDER BY b) as lead | ||
| |FROM (SELECT cast(null as int) as a, 1 as b, 3 as c | ||
| | UNION ALL | ||
| | select cast(null as int) as id, 2 as b, 4 as c) tmp | ||
| """.stripMargin), | ||
| Row(1, 3, null) :: Row(2, null, 4) :: Nil) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we want to keep the behavioral change, we can make this the same as the original
`project` and revert https://github.com/apache/spark/pull/14284/files#diff-4a8f00ca33a80744965463dcc6662c75R351.