-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-9634][SPARK-9323][SQL] cleanup unnecessary Aliases in LogicalPlan at the end of analysis #7957
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-9634][SPARK-9323][SQL] cleanup unnecessary Aliases in LogicalPlan at the end of analysis #7957
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -82,7 +82,9 @@ class Analyzer( | |
| HiveTypeCoercion.typeCoercionRules ++ | ||
| extendedResolutionRules : _*), | ||
| Batch("Nondeterministic", Once, | ||
| PullOutNondeterministic) | ||
| PullOutNondeterministic), | ||
| Batch("Cleanup", fixedPoint, | ||
| CleanupAliases) | ||
| ) | ||
|
|
||
| /** | ||
|
|
@@ -146,8 +148,6 @@ class Analyzer( | |
| child match { | ||
| case _: UnresolvedAttribute => u | ||
| case ne: NamedExpression => ne | ||
| case g: GetStructField => Alias(g, g.field.name)() | ||
| case g: GetArrayStructFields => Alias(g, g.field.name)() | ||
| case g: Generator if g.resolved && g.elementTypes.size > 1 => MultiAlias(g, Nil) | ||
| case e if !e.resolved => u | ||
| case other => Alias(other, s"_c$i")() | ||
|
|
@@ -384,9 +384,7 @@ class Analyzer( | |
| case u @ UnresolvedAttribute(nameParts) => | ||
| // Leave unchanged if resolution fails. Hopefully will be resolved next round. | ||
| val result = | ||
| withPosition(u) { | ||
| q.resolveChildren(nameParts, resolver).map(trimUnresolvedAlias).getOrElse(u) | ||
| } | ||
| withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) } | ||
| logDebug(s"Resolving $u to $result") | ||
| result | ||
| case UnresolvedExtractValue(child, fieldExpr) if child.resolved => | ||
|
|
@@ -412,11 +410,6 @@ class Analyzer( | |
| exprs.exists(_.collect { case _: Star => true }.nonEmpty) | ||
| } | ||
|
|
||
| private def trimUnresolvedAlias(ne: NamedExpression) = ne match { | ||
| case UnresolvedAlias(child) => child | ||
| case other => other | ||
| } | ||
|
|
||
| private def resolveSortOrders(ordering: Seq[SortOrder], plan: LogicalPlan, throws: Boolean) = { | ||
| ordering.map { order => | ||
| // Resolve SortOrder in one round. | ||
|
|
@@ -426,7 +419,7 @@ class Analyzer( | |
| try { | ||
| val newOrder = order transformUp { | ||
| case u @ UnresolvedAttribute(nameParts) => | ||
| plan.resolve(nameParts, resolver).map(trimUnresolvedAlias).getOrElse(u) | ||
| plan.resolve(nameParts, resolver).getOrElse(u) | ||
| case UnresolvedExtractValue(child, fieldName) if child.resolved => | ||
| ExtractValue(child, fieldName, resolver) | ||
| } | ||
|
|
@@ -968,3 +961,61 @@ object EliminateSubQueries extends Rule[LogicalPlan] { | |
| case Subquery(_, child) => child | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Cleans up unnecessary Aliases inside the plan. Basically we only need Alias as a top level | ||
| * expression in Project(project list) or Aggregate(aggregate expressions) or | ||
| * Window(window expressions). | ||
| */ | ||
| object CleanupAliases extends Rule[LogicalPlan] { | ||
| private def trimAliases(e: Expression): Expression = { | ||
| var stop = false | ||
| e.transformDown { | ||
| // CreateStruct is a special case, we need to retain its top level Aliases as they decide the | ||
| // name of StructField. We also need to stop transform down this expression, or the Aliases | ||
| // under CreateStruct will be mistakenly trimmed. | ||
| case c: CreateStruct if !stop => | ||
| stop = true | ||
| c.copy(children = c.children.map(trimNonTopLevelAliases)) | ||
| case c: CreateStructUnsafe if !stop => | ||
| stop = true | ||
| c.copy(children = c.children.map(trimNonTopLevelAliases)) | ||
| case Alias(child, _) if !stop => child | ||
| } | ||
| } | ||
|
|
||
| def trimNonTopLevelAliases(e: Expression): Expression = e match { | ||
| case a: Alias => | ||
| Alias(trimAliases(a.child), a.name)(a.exprId, a.qualifiers, a.explicitMetadata) | ||
| case other => trimAliases(other) | ||
| } | ||
|
|
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| case Project(projectList, child) => | ||
| val cleanedProjectList = | ||
| projectList.map(trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) | ||
| Project(cleanedProjectList, child) | ||
|
|
||
| case Aggregate(grouping, aggs, child) => | ||
| val cleanedAggs = aggs.map(trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) | ||
| Aggregate(grouping.map(trimAliases), cleanedAggs, child) | ||
|
|
||
| case w @ Window(projectList, windowExprs, partitionSpec, orderSpec, child) => | ||
| val cleanedWindowExprs = | ||
| windowExprs.map(e => trimNonTopLevelAliases(e).asInstanceOf[NamedExpression]) | ||
| Window(projectList, cleanedWindowExprs, partitionSpec.map(trimAliases), | ||
| orderSpec.map(trimAliases(_).asInstanceOf[SortOrder]), child) | ||
|
|
||
| case other => | ||
| var stop = false | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not for this release, but this makes me think that we are abusing aliases. I would rather that resolved expressions past the analyzer move the names out of the subexpressions and into the |
||
| other transformExpressionsDown { | ||
| case c: CreateStruct if !stop => | ||
| stop = true | ||
| c.copy(children = c.children.map(trimNonTopLevelAliases)) | ||
| case c: CreateStructUnsafe if !stop => | ||
| stop = true | ||
| c.copy(children = c.children.map(trimNonTopLevelAliases)) | ||
| case Alias(child, _) if !stop => child | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -119,4 +119,21 @@ class AnalysisSuite extends AnalysisTest { | |
| Project(testRelation.output :+ projected, testRelation))) | ||
| checkAnalysis(plan, expected) | ||
| } | ||
|
|
||
| test("SPARK-9634: cleanup unnecessary Aliases in LogicalPlan") { | ||
| val a = testRelation.output.head | ||
| var plan = testRelation.select(((a + 1).as("a+1") + 2).as("col")) | ||
| var expected = testRelation.select((a + 1 + 2).as("col")) | ||
| checkAnalysis(plan, expected) | ||
|
|
||
| plan = testRelation.groupBy(a.as("a1").as("a2"))((min(a).as("min_a") + 1).as("col")) | ||
| expected = testRelation.groupBy(a)((min(a) + 1).as("col")) | ||
| checkAnalysis(plan, expected) | ||
|
|
||
| // CreateStruct is a special case that we should not trim Alias for it. | ||
| plan = testRelation.select(CreateStruct(Seq(a, (a + 1).as("a+1"))).as("col")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. test |
||
| checkAnalysis(plan, plan) | ||
| plan = testRelation.select(CreateStructUnsafe(Seq(a, (a + 1).as("a+1"))).as("col")) | ||
| checkAnalysis(plan, plan) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1135,17 +1135,18 @@ class DataFrame private[sql]( | |
| * @group dfops | ||
| * @since 1.3.0 | ||
| */ | ||
| def withColumn(colName: String, col: Column): DataFrame = { | ||
| def withColumn(colName: String, col: Column, metadata: Option[Metadata] = None): DataFrame = { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cc @rxin , in MLlib sometimes we need to set metadata for the new column, thus we will alias the new column with metadata before call
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you do this in a different PR? Also we should do it without using |
||
| val resolver = sqlContext.analyzer.resolver | ||
| val replaced = schema.exists(f => resolver(f.name, colName)) | ||
| val aliasedColumn = metadata.map(md => col.as(colName, md)).getOrElse(col.as(colName)) | ||
| if (replaced) { | ||
| val colNames = schema.map { field => | ||
| val name = field.name | ||
| if (resolver(name, colName)) col.as(colName) else Column(name) | ||
| if (resolver(name, colName)) aliasedColumn else Column(name) | ||
| } | ||
| select(colNames : _*) | ||
| } else { | ||
| select(Column("*"), col.as(colName)) | ||
| select(Column("*"), aliasedColumn) | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remove everything except for this and the related tests? I'd like to pull this into the release branch without new features.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have opened #8159 to improve the
withColumn, but left the code here to see if we can pass the tests.This PR did 2 things:
Aliasinstead ofUnresolvedAliaswhen resolve nested column inLogicalPlan.resolveIf we only do
1, some tests will fail as we need to trim aliases in the middle of getField chain. If we only do2, it can't fix any bugs. So I put them together here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've open #8215 which is basically your patch without the mllib changes.