
Commit aa62055

wangyum authored and GitHub Enterprise committed

[CARMEL-6243] Handle outer join build side skew (#1088)

* HandleOuterJoinBuildSideSkew
* fix
* handleOuterJoinBuildSideSkew
* Check optimize tag
* fix
* Update SQLConf.scala
* Update SQLConf.scala

1 parent ea35516 commit aa62055

File tree

7 files changed: +287 −1 lines

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 17 additions & 0 deletions

```diff
@@ -3079,6 +3079,23 @@ private[spark] object Utils extends Logging {
       0
     }
   }
+
+  /**
+   * Returns the median of a long array.
+   *
+   * @param sizes the values to take the median of
+   * @param alreadySorted whether `sizes` is already sorted in ascending order
+   * @return the median, floored at 1
+   */
+  def median(sizes: Array[Long], alreadySorted: Boolean): Long = {
+    val len = sizes.length
+    val sortedSize = if (alreadySorted) sizes else sizes.sorted
+    len match {
+      case _ if (len % 2 == 0) =>
+        math.max((sortedSize(len / 2) + sortedSize(len / 2 - 1)) / 2, 1)
+      case _ => math.max(sortedSize(len / 2), 1)
+    }
+  }
 }
 
 private[util] object CallerContext extends Logging {
```
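For a quick sense of the helper, here is a minimal sketch of the skew check that the new AQE rule (below) builds on this function; the partition sizes and the threshold are made-up example values:

```scala
import org.apache.spark.util.Utils

// One partition is far larger than the rest (sizes already sorted ascending).
val bytesByPartition = Array(8L, 9L, 10L, 11L, 2000L)
val threshold = 5.0

// median = 10 and max = 2000, so 2000 > 10 * 5 flags the stage as skewed.
val skewed =
  bytesByPartition.max > Utils.median(bytesByPartition, alreadySorted = true) * threshold
```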

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala

Lines changed: 10 additions & 0 deletions

```diff
@@ -183,6 +183,16 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
     children.foldLeft(Option.empty[BaseType]) { (l, r) => l.orElse(r.find(f)) }
   }
 
+  /**
+   * Tests whether any [[TreeNode]] satisfies the condition specified in `f`.
+   * The condition is recursively applied to this node and all of its children (pre-order).
+   */
+  def exists(f: BaseType => Boolean): Boolean = if (f(this)) {
+    true
+  } else {
+    children.exists(_.exists(f))
+  }
+
   /**
    * Runs the given function on this node and then recursively on [[children]].
    * @param f the function to be applied to each node in the tree.
```
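The `containsBloomFilter` helper in the new rule below uses this method in exactly this shape; as a standalone sketch (the `hasFilter` name is just for illustration):

```scala
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}

// Pre-order test over a plan tree: does any node contain a Filter?
def hasFilter(plan: LogicalPlan): Boolean = plan.exists {
  case _: Filter => true
  case _ => false
}
```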

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 14 additions & 0 deletions

```diff
@@ -355,6 +355,20 @@ object SQLConf {
     .checkValue(threshold => threshold >= 0, "The maximum row count must be non-negative.")
     .createWithDefault(0)
 
+  val HANDLE_OUTER_JOIN_BUILD_SIDE_SKEW_ENABLED =
+    buildConf("spark.sql.optimizer.handleOuterJoinBuildSideSkew.enabled")
+      .doc("When true, enable handling outer join build side skew.")
+      .version("3.3.0")
+      .booleanConf
+      .createWithDefault(false)
+
+  val HANDLE_OUTER_JOIN_BUILD_SIDE_SKEW_THRESHOLD =
+    buildConf("spark.sql.optimizer.handleOuterJoinBuildSideSkewThreshold")
+      .doc("The build side is treated as skewed when its largest shuffle partition exceeds the median partition size multiplied by this value.")
+      .version("3.3.0")
+      .doubleConf
+      .createWithDefault(200)
+
   val COMPRESS_CACHED = buildConf("spark.sql.inMemoryColumnarStorage.compressed")
     .doc("When set to true Spark SQL will automatically select a compression codec for each " +
       "column based on statistics of the data.")
```

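A minimal sketch of enabling the feature for a session with the keys added above (`spark` is an active SparkSession; the threshold of 5 matches the test suite below and is otherwise arbitrary):

```scala
spark.conf.set("spark.sql.optimizer.handleOuterJoinBuildSideSkew.enabled", "true")
// Treat the build side as skewed once its largest shuffle partition
// exceeds 5x the median partition size.
spark.conf.set("spark.sql.optimizer.handleOuterJoinBuildSideSkewThreshold", "5")
```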
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 7 additions & 1 deletion

```diff
@@ -787,7 +787,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   }
 
   object BasicOperators extends Strategy {
-    def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    private def applyLocally(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case d: DataWritingCommand => DataWritingCommandExec(d, planLater(d.query)) :: Nil
       case i: InsertIntoDataSource =>
         InsertIntoDataSourceExec(planLater(i.query), i.overwrite,
@@ -929,6 +929,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         planLater(r.child) :: Nil
       case _ => Nil
     }
+
+    def apply(plan: LogicalPlan): Seq[SparkPlan] = {
+      val sparkPlan = applyLocally(plan)
+      sparkPlan.foreach { p => plan.getOptimizeTags().foreach(p.addOptimizeTag) }
+      sparkPlan
+    }
   }
 
   object CompactDataSourceTable extends Strategy {
```

This refactoring routes planning through `applyLocally` so that any optimize tags recorded on the logical plan (such as the one attached by `HandleOuterJoinBuildSideSkew` below) are copied onto the resulting physical plans, where the test suite later asserts their presence.

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala

Lines changed: 1 addition & 0 deletions

```diff
@@ -32,6 +32,7 @@ class AQEOptimizer(sparkSession: SparkSession) extends RuleExecutor[LogicalPlan]
   private val defaultBatches = Seq(
     Batch("Dynamic Join Selection", Once, DynamicJoinSelection),
     Batch("Adaptive Bloom Filter Join", Once, AdaptiveBloomFilterJoin(sparkSession)),
+    Batch("Handle Outer Join Build Side Skew", Once, HandleOuterJoinBuildSideSkew),
     Batch("Eliminate Join to Empty Relation", Once, EliminateJoinToEmptyRelation),
     Batch("Optimize bloom filter Join", Once, OptimizeBloomFilterJoin)
   )
```
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/HandleOuterJoinBuildSideSkew.scala

Lines changed: 105 additions & 0 deletions

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions.{Alias, BloomFilterMightContain, Expression, Literal, PredicateHelper, ScalarSubquery, XxHash64}
import org.apache.spark.sql.catalyst.expressions.aggregate.BuildBloomFilter
import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, JoinSelectionHelper}
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans.{Inner, LeftAnti, LeftOuter}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.Utils

/**
 * Rewrites a left outer join whose build (right) side is skewed into a union of:
 *   1. an inner join, whose build side may additionally be pruned by a bloom filter
 *      built from the stream (left) side join keys, and
 *   2. a left anti join that recovers the unmatched left rows, padded with nulls.
 */
object HandleOuterJoinBuildSideSkew extends Rule[LogicalPlan]
  with JoinSelectionHelper with PredicateHelper with Logging {

  private def insertPredicate(
      pruningKeys: Seq[Expression],
      pruningPlan: LogicalPlan,
      filteringKey: Seq[Expression],
      filteringPlan: LogicalPlan): LogicalPlan = {
    val filteringRowCount = filteringPlan.stats.rowCount.get
    // To improve build bloom filter performance.
    val coalesceNum = math.max(math.ceil(filteringRowCount.toDouble / 4000000.0).toInt, 1)

    val bloomFilterAgg =
      new BuildBloomFilter(new XxHash64(filteringKey),
        math.max(filteringRowCount.toLong, 1L), true, 0, 0)
    val alias = Alias(bloomFilterAgg.toAggregateExpression(), "bloomFilter")()
    val aggregate = ConstantFolding(Aggregate(Nil, Seq(alias),
      Repartition(coalesceNum, false, filteringPlan)))

    val bloomFilterSubquery = ScalarSubquery(aggregate, Nil)
    Filter(BloomFilterMightContain(bloomFilterSubquery, new XxHash64(pruningKeys)), pruningPlan)
  }

  private def containsBloomFilter(plan: LogicalPlan): Boolean = {
    plan.exists {
      case Filter(condition, _) =>
        splitConjunctivePredicates(condition).exists {
          case _: BloomFilterMightContain => true
          case _ => false
        }
      case _ => false
    }
  }

  def apply(plan: LogicalPlan): LogicalPlan = {
    if (!conf.getConf(SQLConf.HANDLE_OUTER_JOIN_BUILD_SIDE_SKEW_ENABLED)) return plan

    plan.transformDown {
      case join @ ExtractEquiJoinKeys(LeftOuter, leftKeys, rightKeys, _,
          left @ LogicalQueryStage(_, stage1: ShuffleQueryStageExec),
          right @ LogicalQueryStage(_, stage2: ShuffleQueryStageExec), _)
          if stage1.isMaterialized && stage2.isMaterialized &&
            !canPlanAsBroadcastHashJoin(join, conf) && !containsBloomFilter(right) =>
        val rightSize = stage2.mapStats.get.bytesByPartitionId
        val threshold = conf.getConf(SQLConf.HANDLE_OUTER_JOIN_BUILD_SIDE_SKEW_THRESHOLD)
        val maxBloomFilterEntries = conf.dynamicBloomFilterJoinPruningMaxBloomFilterEntries

        // The build side is skewed when its largest shuffle partition exceeds the
        // median partition size by the configured factor.
        if (rightSize.max > Utils.median(rightSize, false) * threshold) {
          // 1. Insert bloom filter
          val insertBF = if (left.stats.rowCount.exists(_ <= maxBloomFilterEntries)) {
            insertPredicate(rightKeys, right, leftKeys, left)
          } else {
            right
          }
          // TODO: 2. Insert partial aggregate
          val joinAttrs = join.condition.map(_.references.filter(canEvaluate(_, right)).toSeq)
            .getOrElse(Nil)
          val insertPartialAgg =
            if (joinAttrs.nonEmpty) PartialAggregate(joinAttrs, joinAttrs, insertBF) else insertBF

          // Should not convert to BHJ
          val joinHint = JoinHint(Some(HintInfo(strategy = Some(NO_BROADCAST_HASH))), None)
          val union = Union(
            join.copy(right = insertBF, joinType = Inner, hint = joinHint),
            Project(left.output ++
              right.output.map(name => Alias(Literal(null, name.dataType), name.name)()),
              Join(left, insertBF, LeftAnti, join.condition, join.hint)))
          union.addOptimizeTag(s"created by ${this.simpleRuleName}")
          union
        } else {
          join
        }
    }
  }
}
```
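In effect the rule relies on the standard identity that a left outer join equals the inner-join matches plus the unmatched left rows padded with nulls; the inner-join branch is then open to further optimization, and its skewed build side can be shrunk by the bloom filter. A conceptual DataFrame sketch of that equivalence, on hypothetical tables `t1` and `t2` joined on column `a` (`spark` is an active SparkSession); this is not the literal plan the rule builds:

```scala
import org.apache.spark.sql.functions.{col, lit}

val t1 = spark.table("t1")
val t2 = spark.table("t2")

// Original shape: t1 LEFT OUTER JOIN t2 ON t1.a = t2.a

// Branch 1: the matching rows, as a plain inner join (the rule additionally
// shrinks t2 with a bloom filter built over t1's join keys).
val matched = t1.join(t2, t1("a") === t2("a"), "inner")

// Branch 2: the left rows with no match, padded with nulls for t2's columns.
val unmatched = t1.join(t2, t1("a") === t2("a"), "left_anti")
  .select(t1.columns.map(col) ++
    t2.schema.map(f => lit(null).cast(f.dataType).as(f.name)): _*)

// Row-for-row equivalent to the original left outer join.
val rewritten = matched.union(unmatched)
```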
sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/HandleOuterJoinBuildSideSkewSuite.scala

Lines changed: 133 additions & 0 deletions

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import org.scalatest.PrivateMethodTester

import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.expressions.BloomFilterMightContain
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, ShuffledHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

class HandleOuterJoinBuildSideSkewSuite
  extends QueryTest
  with SharedSparkSession
  with AdaptiveSparkPlanHelper
  with PrivateMethodTester {

  protected def runAdaptiveAndVerifyResult(query: String): (SparkPlan, SparkPlan) = {
    var finalPlanCnt = 0
    val listener = new SparkListener {
      override def onOtherEvent(event: SparkListenerEvent): Unit = {
        event match {
          case SparkListenerSQLAdaptiveExecutionUpdate(_, _, sparkPlanInfo) =>
            if (sparkPlanInfo.simpleString.startsWith(
              "AdaptiveSparkPlan isFinalPlan=true")) {
              finalPlanCnt += 1
            }
          case _ => // ignore other events
        }
      }
    }
    spark.sparkContext.addSparkListener(listener)

    val dfAdaptive = spark.sql(query)
    val planBefore = dfAdaptive.queryExecution.executedPlan
    assert(planBefore.toString.startsWith("AdaptiveSparkPlan isFinalPlan=false"))
    val result = dfAdaptive.collect()
    withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") {
      val df = spark.sql(query)
      checkAnswer(df, result)
    }
    val planAfter = dfAdaptive.queryExecution.executedPlan
    assert(planAfter.toString.startsWith("AdaptiveSparkPlan isFinalPlan=true"))
    val adaptivePlan = planAfter.asInstanceOf[AdaptiveSparkPlanExec].executedPlan

    spark.sparkContext.listenerBus.waitUntilEmpty()
    // AQE will post `SparkListenerSQLAdaptiveExecutionUpdate` twice in case of subqueries that
    // exist out of query stages.
    val expectedFinalPlanCnt = adaptivePlan.find(_.subqueries.nonEmpty).map(_ => 2).getOrElse(1)
    assert(finalPlanCnt == expectedFinalPlanCnt)
    spark.sparkContext.removeSparkListener(listener)

    val exchanges = adaptivePlan.collect {
      case e: Exchange => e
    }
    assert(exchanges.isEmpty, "The final plan should not contain any Exchange node.")
    (dfAdaptive.queryExecution.sparkPlan, adaptivePlan)
  }

  private def findTopLevelBroadcastHashJoin(plan: SparkPlan): Seq[BroadcastHashJoinExec] = {
    collect(plan) {
      case j: BroadcastHashJoinExec => j
    }
  }

  private def findTopLevelSortMergeJoin(plan: SparkPlan): Seq[SortMergeJoinExec] = {
    collect(plan) {
      case j: SortMergeJoinExec => j
    }
  }

  private def findTopLevelShuffledHashJoin(plan: SparkPlan): Seq[ShuffledHashJoinExec] = {
    collect(plan) {
      case j: ShuffledHashJoinExec => j
    }
  }

  private def hasBloomFilterJoin(plan: SparkPlan): Seq[FilterExec] = {
    collectWithSubqueries(plan) {
      case f @ FilterExec(e, _) if e.isInstanceOf[BloomFilterMightContain] => f
    }
  }

  test("Handle outer join build side skew suite") {
    withSQLConf(
      SQLConf.HANDLE_OUTER_JOIN_BUILD_SIDE_SKEW_ENABLED.key -> "true",
      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
      SQLConf.HANDLE_OUTER_JOIN_BUILD_SIDE_SKEW_THRESHOLD.key -> "5") {
      withTable("t1", "t2") {
        spark.range(10).selectExpr("id as a", "id as b", "id as c").write.saveAsTable("t1")
        spark.range(10000).selectExpr("1 as a", "id as b", "id as c").write.saveAsTable("t2")

        sql("insert into t1 values(null, null, null)")
        sql("insert into t2 values(null, null, null)")

        val (plan, adaptivePlan) = runAdaptiveAndVerifyResult(
          "select * from t1 left join t2 on t1.a = t2.a")

        assert(findTopLevelSortMergeJoin(plan).size === 1)
        assert(hasBloomFilterJoin(plan).size === 0)
        assert(collect(plan) { case j: UnionExec => j }.size === 0)
        assert(findTopLevelSortMergeJoin(adaptivePlan).size === 2)
        assert(hasBloomFilterJoin(adaptivePlan).size === 2)
        assert(collect(adaptivePlan) { case j: UnionExec => j }.size === 1)

        // Check optimize tag
        assert(adaptivePlan.toString.contains("created by HandleOuterJoinBuildSideSkew"))
      }
    }
  }
}
```
