[SPARK-32220][SQL]SHUFFLE_REPLICATE_NL Hint should not change Non-Cartesian Product join result #29035
Changes from all commits: 5f1269f, 27b41e5, aeccc25, 36ff4ca, a65d332, d9b4dcd
SparkStrategies.scala:

@@ -159,7 +159,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       // 4. Pick cartesian product if join type is inner like.
       // 5. Pick broadcast nested loop join as the final solution. It may OOM but we don't have
       //    other choice.
-      case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right, hint) =>
+      case p @ ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right, hint) =>
         def createBroadcastHashJoin(onlyLookingAtHint: Boolean) = {
           getBroadcastBuildSide(left, right, joinType, hint, onlyLookingAtHint, conf).map {
             buildSide =>

@@ -199,7 +199,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         def createCartesianProduct() = {
           if (joinType.isInstanceOf[InnerLike]) {
-            Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), condition)))
+            Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), p.condition)))
Contributor:
good catch!

Member:
We need to write a comment above this line and explain what it is doing.

Contributor (Author):
Raised PR #29084.
           } else {
             None
           }
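Why this small change matters (explanatory note, not part of the PR): `ExtractEquiJoinKeys` splits the original join condition into equi-join key pairs plus whatever predicates are left over, and binds only the leftover part to `condition`. Building `CartesianProductExec` from that leftover part silently drops the equality predicates, so a hinted equi-join degenerates into a plain cross product; `p.condition`, the full condition kept on the logical `Join` node, preserves them. Below is a minimal plain-Scala sketch of the effect (toy names, no Spark dependency, not the planner code itself):

```scala
// Toy model only (plain Scala, no Spark): the planner splits "t1.key = t2.key"
// into equi-join keys plus a leftover predicate. If the cartesian product is
// filtered with just the leftover part, the key equality is silently dropped.
object ShuffleReplicateNlSketch extends App {
  case class Rec(key: Int, value: String)

  val t1 = Seq(Rec(1, "4"), Rec(2, "2"))
  val t2 = Seq(Rec(1, "1"), Rec(2, "12.3"), Rec(2, "123"))

  // Full join condition: key equality; nothing is left over after key extraction.
  val equiKeys: (Rec, Rec) => Boolean = (l, r) => l.key == r.key
  val leftoverCondition: Option[(Rec, Rec) => Boolean] = None

  // Correct: filter the cross product with the full condition (analogue of p.condition).
  val correct = for (l <- t1; r <- t2 if equiKeys(l, r)) yield (l, r)

  // Buggy: filter only with the leftover condition -> effectively a cross product here.
  val buggy = for (l <- t1; r <- t2 if leftoverCondition.forall(_(l, r))) yield (l, r)

  println(correct.size) // 3
  println(buggy.size)   // 6 -- the inflated result the leftover-only filter produces
}
```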
JoinHintSuite.scala:

@@ -570,4 +570,31 @@ class JoinHintSuite extends PlanTest with SharedSparkSession with AdaptiveSparkP
       assert(joinHints == expectedHints)
     }
   }
+
+  test("SPARK-32220: Non Cartesian Product Join Result Correct with SHUFFLE_REPLICATE_NL hint") {
Member:
So, is this a correctness issue, @AngersZhuuuu?

Member:
Yeah, I think so. Nice catch, @AngersZhuuuu.

Contributor (Author):
Yes, when I tried the new join hint, I found the result was incorrect.
+    withTempView("t1", "t2") {
+      Seq((1, "4"), (2, "2")).toDF("key", "value").createTempView("t1")
+      Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2")
+      val df1 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key = t2.key")
+      val df2 = sql("SELECT * from t1 join t2 ON t1.key = t2.key")
+      assert(df1.collect().size == df2.collect().size)
+
+      val df3 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2")
+      val df4 = sql("SELECT * from t1 join t2")
+      assert(df3.collect().size == df4.collect().size)
+
+      val df5 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < t2.key")
+      val df6 = sql("SELECT * from t1 join t2 ON t1.key < t2.key")
+      assert(df5.collect().size == df6.collect().size)
+
+      val df7 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < 2")
+      val df8 = sql("SELECT * from t1 join t2 ON t1.key < 2")
+      assert(df7.collect().size == df8.collect().size)
+
+      val df9 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t2.key < 2")
+      val df10 = sql("SELECT * from t1 join t2 ON t2.key < 2")
+      assert(df9.collect().size == df10.collect().size)
+    }
+  }
 }
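A hypothetical extension of the test above (not part of this PR; it reuses df1/df2 from the test body): comparing only `collect().size` would not catch a plan that returns the right number of wrong rows, so one could also compare row contents and, as a sketch, check that the hinted query still plans a `CartesianProductExec`. With adaptive execution enabled the executed plan may be wrapped in an adaptive plan node, so the plan check is illustrative only.

```scala
// Hypothetical extra assertions, reusing df1/df2 from the test body above.
// Compare row contents (order-insensitive, duplicates preserved), not just counts.
assert(df1.collect().sortBy(_.toString).sameElements(df2.collect().sortBy(_.toString)))

// Sketch: confirm the hinted query still uses a cartesian product. With AQE the
// executed plan can be wrapped in an adaptive plan node, so this may need adjusting.
import org.apache.spark.sql.execution.joins.CartesianProductExec
assert(df1.queryExecution.executedPlan.collect {
  case c: CartesianProductExec => c
}.nonEmpty)
```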
To avoid making similar mistakes, we need to rename `condition` to a self-descriptive name. "otherConditions"? It is a little bit hard to name it, TBH.
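A sketch of what that rename could look like at the match site, illustrative only and not an actual change in this PR; the surrounding code is taken from the diff above, and the binding name simply follows the reviewer's "otherConditions" suggestion.

```scala
// Illustrative only: a possible self-descriptive binding name at the match site.
// "otherConditions" holds whatever predicates remain after the equi-join keys are
// extracted; the full original condition stays available as p.condition.
case p @ ExtractEquiJoinKeys(
    joinType, leftKeys, rightKeys, otherConditions, left, right, hint) =>
  def createCartesianProduct() = {
    if (joinType.isInstanceOf[InnerLike]) {
      // must use the full condition here, not just the leftover predicates
      Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), p.condition)))
    } else {
      None
    }
  }
```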