-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-28746][SQL] Add partitionby hint for sql queries #25464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b259859
33a8b0e
5c7e871
ebe9220
4f06914
bed289e
67ce5d5
c204ce3
c57805b
4fbf34a
b0ae689
18ed232
5cc98b7
4182994
62f8c4f
8af48d3
507735c
e5eb369
44aa0b8
4ac7eb6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,7 @@ import java.util.Locale | |
| import scala.collection.mutable | ||
|
|
||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.expressions.IntegerLiteral | ||
| import org.apache.spark.sql.catalyst.expressions.{Ascending, Expression, IntegerLiteral, SortOrder} | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.catalyst.trees.CurrentOrigin | ||
|
|
@@ -137,31 +137,101 @@ object ResolveHints { | |
| } | ||
|
|
||
| /** | ||
| * COALESCE Hint accepts name "COALESCE" and "REPARTITION". | ||
| * Its parameter includes a partition number. | ||
| * COALESCE Hint accepts names "COALESCE", "REPARTITION", and "REPARTITION_BY_RANGE". | ||
| */ | ||
| object ResolveCoalesceHints extends Rule[LogicalPlan] { | ||
| private val COALESCE_HINT_NAMES = Set("COALESCE", "REPARTITION") | ||
| class ResolveCoalesceHints(conf: SQLConf) extends Rule[LogicalPlan] { | ||
|
|
||
| /** | ||
| * This function handles hints for "COALESCE" and "REPARTITION". | ||
| * The "COALESCE" hint only has a partition number as a parameter. The "REPARTITION" hint | ||
| * has a partition number, columns, or both of them as parameters. | ||
| */ | ||
| private def createRepartition( | ||
| shuffle: Boolean, hint: UnresolvedHint): LogicalPlan = { | ||
| val hintName = hint.name.toUpperCase(Locale.ROOT) | ||
|
|
||
| def createRepartitionByExpression( | ||
| numPartitions: Int, partitionExprs: Seq[Any]): RepartitionByExpression = { | ||
| val sortOrders = partitionExprs.filter(_.isInstanceOf[SortOrder]) | ||
| if (sortOrders.nonEmpty) throw new IllegalArgumentException( | ||
| s"""Invalid partitionExprs specified: $sortOrders | ||
| |For range partitioning use REPARTITION_BY_RANGE instead. | ||
| """.stripMargin) | ||
| val invalidParams = partitionExprs.filter(!_.isInstanceOf[UnresolvedAttribute]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this check breaks the old API: in Spark 2.4 it is possible to use an expression here. I think we need to back this out.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAIK 2.4 only supports something like
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, I think so, too.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah ok, you are right. Nevertheless, I think we should support expressions for REPARTITION here. |
||
| if (invalidParams.nonEmpty) { | ||
| throw new AnalysisException(s"$hintName Hint parameter should include columns, but " + | ||
| s"${invalidParams.mkString(", ")} found") | ||
| } | ||
| RepartitionByExpression( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we then consistently throw an exception, like this:
```scala
val sortOrders = partitionExprs.filter(_.expr.isInstanceOf[SortOrder])
if (sortOrders.nonEmpty) throw new IllegalArgumentException(
  s"""Invalid partitionExprs specified: $sortOrders
     |For range partitioning use repartitionByRange(...) instead.
   """.stripMargin)
```
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point; I added an IllegalArgumentException check. |
||
| partitionExprs.map(_.asInstanceOf[Expression]), hint.child, numPartitions) | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case h: UnresolvedHint if COALESCE_HINT_NAMES.contains(h.name.toUpperCase(Locale.ROOT)) => | ||
| val hintName = h.name.toUpperCase(Locale.ROOT) | ||
| val shuffle = hintName match { | ||
| case "REPARTITION" => true | ||
| case "COALESCE" => false | ||
| hint.parameters match { | ||
| case Seq(IntegerLiteral(numPartitions)) => | ||
| Repartition(numPartitions, shuffle, hint.child) | ||
| case Seq(numPartitions: Int) => | ||
| Repartition(numPartitions, shuffle, hint.child) | ||
| // The "COALESCE" hint (shuffle = false) must have a partition number only | ||
| case _ if !shuffle => | ||
| throw new AnalysisException(s"$hintName Hint expects a partition number as a parameter") | ||
|
|
||
| case param @ Seq(IntegerLiteral(numPartitions), _*) if shuffle => | ||
| createRepartitionByExpression(numPartitions, param.tail) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, It's
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I tried to keep consistent with |
||
| case param @ Seq(numPartitions: Int, _*) if shuffle => | ||
| createRepartitionByExpression(numPartitions, param.tail) | ||
| case param @ Seq(_*) if shuffle => | ||
| createRepartitionByExpression(conf.numShufflePartitions, param) | ||
| } | ||
| } | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about the case, |
||
| /** | ||
| * This function handles hints for "REPARTITION_BY_RANGE". | ||
| * The "REPARTITION_BY_RANGE" hint must have column names and a partition number is optional. | ||
| */ | ||
| private def createRepartitionByRange(hint: UnresolvedHint): RepartitionByExpression = { | ||
| val hintName = hint.name.toUpperCase(Locale.ROOT) | ||
|
|
||
| def createRepartitionByExpression( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duplicated is in order to make method clearly for
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, both inner functions and duplication are discouraged, but okay. |
||
| numPartitions: Int, partitionExprs: Seq[Any]): RepartitionByExpression = { | ||
| val invalidParams = partitionExprs.filter(!_.isInstanceOf[UnresolvedAttribute]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
| if (invalidParams.nonEmpty) { | ||
| throw new AnalysisException(s"$hintName Hint parameter should include columns, but " + | ||
| s"${invalidParams.mkString(", ")} found") | ||
| } | ||
| val numPartitions = h.parameters match { | ||
| case Seq(IntegerLiteral(numPartitions)) => | ||
| numPartitions | ||
| case Seq(numPartitions: Int) => | ||
| numPartitions | ||
| case _ => | ||
| throw new AnalysisException(s"$hintName Hint expects a partition number as parameter") | ||
| val sortOrder = partitionExprs.map { | ||
| case expr: SortOrder => expr | ||
| case expr: Expression => SortOrder(expr, Ascending) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC this hint cannot accept all the |
||
| } | ||
| RepartitionByExpression(sortOrder, hint.child, numPartitions) | ||
| } | ||
|
|
||
| hint.parameters match { | ||
| case param @ Seq(IntegerLiteral(numPartitions), _*) => | ||
| createRepartitionByExpression(numPartitions, param.tail) | ||
| case param @ Seq(numPartitions: Int, _*) => | ||
| createRepartitionByExpression(numPartitions, param.tail) | ||
| case param @ Seq(_*) => | ||
| createRepartitionByExpression(conf.numShufflePartitions, param) | ||
| } | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case hint @ UnresolvedHint(hintName, _, _) => hintName.toUpperCase(Locale.ROOT) match { | ||
| case "REPARTITION" => | ||
| createRepartition(shuffle = true, hint) | ||
| case "COALESCE" => | ||
| createRepartition(shuffle = false, hint) | ||
| case "REPARTITION_BY_RANGE" => | ||
| createRepartitionByRange(hint) | ||
| case _ => plan | ||
|
Contributor
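There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't we return `hint` here instead of `plan`? Inside `resolveOperators`, returning `plan` replaces the unmatched hint node with the entire plan.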
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, yes. I will make a followup. Thanks for catching this. |
||
| } | ||
| Repartition(numPartitions, shuffle, h.child) | ||
| } | ||
| } | ||
|
|
||
| object ResolveCoalesceHints { | ||
| val COALESCE_HINT_NAMES: Set[String] = Set("COALESCE", "REPARTITION", "REPARTITION_BY_RANGE") | ||
| } | ||
|
|
||
| /** | ||
| * Removes all the hints, used to remove invalid hints provided by the user. | ||
| * This must be executed after all the other hint rules are executed. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Style: please put this inside curly braces and on a new line.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ulysses-you Could you do a follow-up for the two comments from @hvanhovell?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, it might be better to just handle this later, when we happen to touch this code, given that we don't usually make follow-ups for minor style issues.