-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21479][SQL] Outer join filter pushdown in null supplying table when condition is on one of the joined columns #20816
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ac17976
1c5dedb
b10879f
7fe9329
194e6e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -637,8 +637,11 @@ object CollapseWindow extends Rule[LogicalPlan] { | |
| * constraints. These filters are currently inserted to the existing conditions in the Filter | ||
| * operators and on either side of Join operators. | ||
| * | ||
| * Note: While this optimization is applicable to all types of join, it primarily benefits Inner and | ||
| * LeftSemi joins. | ||
| * In addition, for left/right outer joins, infer predicate from the preserved side of the Join | ||
| * operator and push the inferred filter over to the null-supplying side. For example, if the | ||
| * preserved side has constraints of the form 'a > 5' and the join condition is 'a = b', in | ||
| * which 'b' is an attribute from the null-supplying side, a [[Filter]] operator of 'b > 5' will | ||
| * be applied to the null-supplying side. | ||
| */ | ||
| object InferFiltersFromConstraints extends Rule[LogicalPlan] with PredicateHelper { | ||
|
|
||
|
|
@@ -671,11 +674,42 @@ object InferFiltersFromConstraints extends Rule[LogicalPlan] with PredicateHelpe | |
| val newConditionOpt = conditionOpt match { | ||
| case Some(condition) => | ||
| val newFilters = additionalConstraints -- splitConjunctivePredicates(condition) | ||
| if (newFilters.nonEmpty) Option(And(newFilters.reduce(And), condition)) else None | ||
| if (newFilters.nonEmpty) Option(And(newFilters.reduce(And), condition)) else conditionOpt | ||
| case None => | ||
| additionalConstraints.reduceOption(And) | ||
| } | ||
| if (newConditionOpt.isDefined) Join(left, right, joinType, newConditionOpt) else join | ||
| // Infer filter for left/right outer joins | ||
| val newLeftOpt = joinType match { | ||
| case RightOuter if newConditionOpt.isDefined => | ||
| val inferredConstraints = left.getRelevantConstraints( | ||
| left.constraints | ||
| .union(right.constraints) | ||
| .union(splitConjunctivePredicates(newConditionOpt.get).toSet)) | ||
| val newFilters = inferredConstraints | ||
| .filterNot(left.constraints.contains) | ||
| .reduceLeftOption(And) | ||
| newFilters.map(Filter(_, left)) | ||
| case _ => None | ||
| } | ||
| val newRightOpt = joinType match { | ||
| case LeftOuter if newConditionOpt.isDefined => | ||
| val inferredConstraints = right.getRelevantConstraints( | ||
| right.constraints | ||
| .union(left.constraints) | ||
| .union(splitConjunctivePredicates(newConditionOpt.get).toSet)) | ||
| val newFilters = inferredConstraints | ||
| .filterNot(right.constraints.contains) | ||
| .reduceLeftOption(And) | ||
| newFilters.map(Filter(_, right)) | ||
| case _ => None | ||
| } | ||
|
|
||
| if ((newConditionOpt.isDefined && (newConditionOpt ne conditionOpt)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have a test case in which
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So here, if
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we change this to
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess they do. Only that when |
||
| || newLeftOpt.isDefined || newRightOpt.isDefined) { | ||
| Join(newLeftOpt.getOrElse(left), newRightOpt.getOrElse(right), joinType, newConditionOpt) | ||
| } else { | ||
| join | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -204,4 +204,40 @@ class InferFiltersFromConstraintsSuite extends PlanTest { | |
| val optimized = Optimize.execute(originalQuery) | ||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("SPARK-21479: Outer join after-join filters push down to null-supplying side") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation.subquery('y) | ||
| val condition = Some("x.a".attr === "y.a".attr) | ||
| val originalQuery = x.join(y, LeftOuter, condition).where("x.a".attr === 2).analyze | ||
| val left = x.where(IsNotNull('a) && 'a === 2) | ||
| val right = y.where(IsNotNull('a) && 'a === 2) | ||
| val correctAnswer = left.join(right, LeftOuter, condition).analyze | ||
| val optimized = Optimize.execute(originalQuery) | ||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("SPARK-21479: Outer join pre-existing filters push down to null-supplying side") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation.subquery('y) | ||
| val condition = Some("x.a".attr === "y.a".attr) | ||
| val originalQuery = x.join(y.where("y.a".attr > 5), RightOuter, condition).analyze | ||
| val left = x.where(IsNotNull('a) && 'a > 5) | ||
| val right = y.where(IsNotNull('a) && 'a > 5) | ||
| val correctAnswer = left.join(right, RightOuter, condition).analyze | ||
| val optimized = Optimize.execute(originalQuery) | ||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("SPARK-21479: Outer join no filter push down to preserved side") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation.subquery('y) | ||
| val condition = Some("x.a".attr === "y.a".attr) | ||
| val originalQuery = x.join(y.where("y.a".attr === 1), LeftOuter, condition).analyze | ||
| val left = x | ||
| val right = y.where(IsNotNull('a) && 'a === 1) | ||
| val correctAnswer = left.join(right, LeftOuter, condition).analyze | ||
| val optimized = Optimize.execute(originalQuery) | ||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All the test cases can pass without this new rule.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please verify again. The first two could not pass without this rule, the last one could since it's a counter case.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah. Could you change the |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we keep this unchanged? We just
conditionOptin line 681, 685, 693, and 697?Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So what would be the benefit of keeping that unchanged? To me, it would make the code look confusing. And in theory the two parts (1. infer
newConditionOpt; 2. infernewLeftOpornewRightOpt) of this optimization rule will be unsynchronized, leaving part 2 always one iteration behind part 1.