Drop PartitionCoalesce and use Repartition

maropu · maropu · commit 3b4c679a0f16 · 2017-08-07T16:34:48.000+09:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -596,14 +596,15 @@ object CollapseProject extends Rule[LogicalPlan] {
 object CollapseRepartition extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
     // Case 1: When a Repartition has a child of Repartition or RepartitionByExpression,
-    // 1) When the top node does not enable the shuffle (i.e., coalesce API), but the child
-    //   enables the shuffle. Returns the child node if the last numPartitions is bigger;
-    //   otherwise, keep unchanged.
+    // 1) When the top node does not enable the shuffle (i.e., coalesce with no user-specified
+    //   strategy), but the child enables the shuffle. Returns the child node if the last
+    //   numPartitions is bigger; otherwise, keep unchanged.
     // 2) In the other cases, returns the top node with the child's child
-    case r @ Repartition(_, _, child: RepartitionOperation) => (r.shuffle, child.shuffle) match {
-      case (false, true) => if (r.numPartitions >= child.numPartitions) child else r
-      case _ => r.copy(child = child.child)
-    }
+    case r @ Repartition(_, _, child: RepartitionOperation, None) =>
+      (r.shuffle, child.shuffle) match {
+        case (false, true) => if (r.numPartitions >= child.numPartitions) child else r
+        case _ => r.copy(child = child.child)
+      }
     // Case 2: When a RepartitionByExpression has a child of Repartition or RepartitionByExpression
     // we can remove the child.
     case r @ RepartitionByExpression(_, child: RepartitionOperation, _) =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -746,22 +746,24 @@ abstract class RepartitionOperation extends UnaryNode {
  * [[RepartitionByExpression]] as this method is called directly by DataFrame's, because the user
  * asked for `coalesce` or `repartition`. [[RepartitionByExpression]] is used when the consumer
  * of the output requires some specific ordering or distribution of the data.
+ *
+ * If `shuffle` = false (`coalesce` cases), this logical plan can have a user-specified strategy
+ * to coalesce input partitions.
+ *
+ * @param numPartitions How many partitions to use in the output RDD
+ * @param shuffle Whether to shuffle when repartitioning
+ * @param child the LogicalPlan
+ * @param coalescer Optional coalescer that a user specifies
  */
-case class Repartition(numPartitions: Int, shuffle: Boolean, child: LogicalPlan)
+case class Repartition(
+    numPartitions: Int,
+    shuffle: Boolean,
+    child: LogicalPlan,
+    coalescer: Option[PartitionCoalescer] = None)
   extends RepartitionOperation {
   require(numPartitions > 0, s"Number of partitions ($numPartitions) must be positive.")
 }
 
-/**
- * Returns a new RDD that has at most `numPartitions` partitions. This behavior can be modified by
- * supplying a `PartitionCoalescer` to control the behavior of the partitioning.
- */
-case class PartitionCoalesce(numPartitions: Int, coalescer: PartitionCoalescer, child: LogicalPlan)
-  extends UnaryNode {
-  require(numPartitions > 0, s"Number of partitions ($numPartitions) must be positive.")
-  override def output: Seq[Attribute] = child.output
-}
-
 /**
  * This method repartitions data using [[Expression]]s into `numPartitions`, and receives
  * information about the number of partitions during execution. Used when a specific ordering or
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -2680,14 +2680,9 @@ class Dataset[T] private[sql](
    * @group typedrel
    * @since 2.3.0
    */
-  def coalesce(numPartitions: Int, userDefinedCoalescer: Option[PartitionCoalescer]): Dataset[T] = {
-    userDefinedCoalescer.map { coalescer =>
-      withTypedPlan {
-        PartitionCoalesce(numPartitions, coalescer, logicalPlan)
-      }
-    }.getOrElse {
-      coalesce(numPartitions)
-    }
+  def coalesce(numPartitions: Int, userDefinedCoalescer: Option[PartitionCoalescer])
+    : Dataset[T] = withTypedPlan {
+    Repartition(numPartitions, shuffle = false, logicalPlan, userDefinedCoalescer)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -390,14 +390,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           f, key, lObj, rObj, lGroup, rGroup, lAttr, rAttr, oAttr,
           planLater(left), planLater(right)) :: Nil
 
-      case logical.Repartition(numPartitions, shuffle, child) =>
+      case logical.Repartition(numPartitions, shuffle, child, coalescer) =>
         if (shuffle) {
           ShuffleExchange(RoundRobinPartitioning(numPartitions), planLater(child)) :: Nil
         } else {
-          execution.CoalesceExec(numPartitions, planLater(child), None) :: Nil
+          execution.CoalesceExec(numPartitions, planLater(child), coalescer) :: Nil
         }
-      case logical.PartitionCoalesce(numPartitions, coalescer, child) =>
-        execution.CoalesceExec(numPartitions, planLater(child), Some(coalescer)) :: Nil
       case logical.Sort(sortExprs, global, child) =>
         execution.SortExec(sortExprs, global, planLater(child)) :: Nil
       case logical.Project(projectList, child) =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -244,7 +244,7 @@ class PlannerSuite extends SharedSQLContext {
     assert(countRepartitions(doubleRepartitioned.queryExecution.logical) === 3)
     assert(countRepartitions(doubleRepartitioned.queryExecution.optimizedPlan) === 2)
     doubleRepartitioned.queryExecution.optimizedPlan match {
-      case Repartition (numPartitions, shuffle, Repartition(_, shuffleChild, _)) =>
+      case Repartition(numPartitions, shuffle, Repartition(_, shuffleChild, _, _), _) =>
         assert(numPartitions === 5)
         assert(shuffle === false)
         assert(shuffleChild === true)