Commit 9e899c1

wakun authored and GitHub Enterprise committed
[CARMEL-6586] Ignore SinglePartition when determining expectedChildrenNumPartitions (#1252)
1 parent 25a3b90 commit 9e899c1
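
The scenario this change targets: when every non-shuffle child of a join reports SinglePartition as its output partitioning (for example the `select * from t1 limit 1000` subquery in the new test), that single partition no longer determines the expected number of partitions; EnsureRequirements falls back to the configured shuffle partition count instead. A minimal sketch of that query shape, assuming a SparkSession named spark and tables t1 and t2 like those the new test creates:

// Hedged sketch, not the committed test: the LIMIT subquery ends up with
// SinglePartition output; with this change the other join side is still
// shuffled to the configured shuffle partition count rather than being
// sized by the single-partition child.
val join = spark.sql(
  """SELECT *
    |FROM (SELECT * FROM t1 LIMIT 1000) t1
    |JOIN t2 ON t1.id = t2.id
    |""".stripMargin)
join.collect()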

2 files changed: +65 -2 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala

Lines changed: 6 additions & 1 deletion
@@ -285,7 +285,12 @@ object EnsureRequirements extends Rule[SparkPlan] {
     val nonShuffleChildrenNumPartitions =
       childrenIndexes.map(children).filterNot(_.isInstanceOf[ShuffleExchangeExec])
         .map(_.outputPartitioning.numPartitions)
-    val expectedChildrenNumPartitions = if (nonShuffleChildrenNumPartitions.nonEmpty) {
+    val allSinglePartition =
+      childrenIndexes.map(children).filterNot(_.isInstanceOf[ShuffleExchangeExec])
+        .forall(_.outputPartitioning == SinglePartition)
+    val expectedChildrenNumPartitions = if (allSinglePartition) {
+      conf.numShufflePartitions
+    } else if (nonShuffleChildrenNumPartitions.nonEmpty) {
       if (nonShuffleChildrenNumPartitions.length == childrenIndexes.length) {
         // Here we pick the max number of partitions among these non-shuffle children.
         nonShuffleChildrenNumPartitions.max
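
A compact restatement of the selection rule in this hunk, as a standalone sketch: the partitioning types below are simplified stand-ins for Spark's physical partitioning classes, and the sketch folds the method's remaining branches (including the length check against childrenIndexes) into the single max rule, so it is illustrative rather than the committed code.

// Standalone sketch with simplified stand-ins; not the committed EnsureRequirements code.
sealed trait Partitioning { def numPartitions: Int }
case object SinglePartition extends Partitioning { val numPartitions = 1 }
final case class HashPartitioning(numPartitions: Int) extends Partitioning

object ExpectedPartitionsSketch {
  // nonShuffleChildren: output partitionings of join children that are not ShuffleExchangeExec
  def expectedChildrenNumPartitions(
      nonShuffleChildren: Seq[Partitioning],
      numShufflePartitions: Int): Int = {
    if (nonShuffleChildren.forall(_ == SinglePartition)) {
      // All non-shuffle children are SinglePartition: ignore them and fall back to the
      // configured shuffle partition count (in this sketch the empty case also lands here).
      numShufflePartitions
    } else {
      // Otherwise pick the max partition count among the non-shuffle children,
      // e.g. the larger bucket number when two bucketed tables are joined.
      nonShuffleChildren.map(_.numPartitions).max
    }
  }

  def main(args: Array[String]): Unit = {
    assert(expectedChildrenNumPartitions(Seq(SinglePartition), 100) == 100)
    assert(expectedChildrenNumPartitions(Seq(HashPartitioning(10), HashPartitioning(20)), 100) == 20)
  }
}

The two assertions mirror the cases covered by the new test in CoalesceShufflePartitionsSuite below.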

sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala

Lines changed: 59 additions & 1 deletion
@@ -22,9 +22,10 @@ import org.scalatest.BeforeAndAfterAll
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.internal.config.UI.UI_ENABLED
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
 import org.apache.spark.sql.execution.adaptive._
 import org.apache.spark.sql.execution.adaptive.CustomShuffleReaderExec
-import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_NONE, ReusedExchangeExec}
+import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_NONE, ReusedExchangeExec, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -521,6 +522,63 @@ class CoalesceShufflePartitionsSuite
       withSparkSession(test, 10000, minPartitionNum)
     }
   }
+
+  test(s"Ignore SinglePartition when determining expectedChildrenNumPartitions") {
+    val test: SparkSession => Unit = { spark: SparkSession =>
+      try {
+        val aeqShufflePartitionNum = 100
+        spark.conf.set(SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key,
+          aeqShufflePartitionNum)
+        val df = spark.range(100).toDF("id").selectExpr("id", "id as name")
+        df.write.format("parquet").saveAsTable("t1")
+        df.write.format("parquet").saveAsTable("t2")
+        df.write.bucketBy(10, "id").format("parquet").saveAsTable("t3")
+        df.write.bucketBy(20, "id").format("parquet").saveAsTable("t4")
+
+        /* SinglePartition join with a plain parquet table */
+        val join1 = spark.sql(
+          s"""SELECT *
+             |FROM (select * from t1 limit 1000) t1
+             |JOIN t2
+             |ON t1.id = t2.id
+             |""".stripMargin)
+        join1.collect()
+        assert(collect(join1.queryExecution.executedPlan) {
+          case r @ ShuffleExchangeExec(HashPartitioning(_, 100), _, _) => r
+        }.length === 2)
+
+        /* Bucket join: only shuffle the plain parquet table, to the bucket number */
+        val join2 = spark.sql(
+          s"""SELECT *
+             |FROM t3
+             |JOIN t2
+             |ON t3.id = t2.id
+             |""".stripMargin)
+        join2.collect()
+        assert(collect(join2.queryExecution.executedPlan) {
+          case r @ ShuffleExchangeExec(HashPartitioning(_, 10), _, _) => r
+        }.length === 1)
+
+        /* Join of two bucketed tables: shuffle the smaller-bucketed side to the larger bucket number */
+        val join3 = spark.sql(
+          s"""SELECT *
+             |FROM t3
+             |JOIN t4
+             |ON t3.id = t4.id
+             |""".stripMargin)
+        join3.collect()
+        assert(collect(join3.queryExecution.executedPlan) {
+          case r @ ShuffleExchangeExec(HashPartitioning(_, 20), _, _) => r
+        }.length === 1)
+      } finally {
+        Seq("t1", "t2", "t3", "t4").foreach { name =>
+          spark.sql(s"DROP TABLE IF EXISTS $name")
+        }
+      }
+    }
+
+    withSparkSession(test, 10000, None)
+  }
 }
 
 object CoalescedShuffleReader {
