[SPARK-54075][SQL] Make ResolvedCollation evaluable

mihailoale-db · dongjoon-hyun · commit 571b802db743 · 2025-10-29T10:34:23.000-07:00
### What changes were proposed in this pull request? In this PR I propose to make `ResolvedCollation` evaluable. By making `ResolvedCollation` evaluable, it can now pass the `canEvaluateWithinJoin` check, allowing the optimizer to use efficient hash joins for queries with inline COLLATE in join conditions (before this change, we would fall back to `BroadcastNestedLoopJoin`). ### Why are the changes needed? To improve performance of specific queries (see added tests). ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #52779 from mihailoale-db/makecollationresolvable. Authored-by: mihailoale-db <mihailo.aleksic@databricks.com> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
@@ -125,14 +125,30 @@ case class UnresolvedCollation(collationName: Seq[String])
 /**
  * An expression that represents a resolved collation name.
  */
-case class ResolvedCollation(collationName: String) extends LeafExpression with Unevaluable {
+case class ResolvedCollation(collationName: String) extends LeafExpression {
   override def nullable: Boolean = false
 
   override def dataType: DataType = StringType(CollationFactory.collationNameToId(collationName))
 
   override def toString: String = collationName
 
   override def sql: String = collationName
+
+  /**
+   * Literal carrying the collation name, typed with this expression's collated string type.
+   * Built lazily once so that `eval` does not allocate a new Literal for every input row.
+   */
+  @transient private lazy val collationLiteral: Literal = Literal.create(collationName, dataType)
+
+  /** Evaluates to whatever the collation-name literal evaluates to for the given row. */
+  override def eval(input: InternalRow): Any = collationLiteral.eval(input)
+
+  /** Delegates code generation to the collation-name literal instead of using `doGenCode`. */
+  override def genCode(ctx: CodegenContext): ExprCode = collationLiteral.genCode(ctx)
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    throw SparkException.internalError("ResolvedCollation.doGenCode should not be called.")
+  }
 }
 
 // scalastyle:off line.contains.tab
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
@@ -1865,6 +1865,39 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("inline COLLATE expressions in join conditions should not use nested loop join") {
+    withTable("table1", "table2", "table3") {
+      // table1 and table2 share the same schema and rows; they are joined on `id`.
+      Seq("table1", "table2").foreach { tableName =>
+        sql(s"CREATE TABLE $tableName (id STRING, col1 STRING) USING PARQUET")
+        sql(s"INSERT INTO $tableName VALUES ('1', 'a'), ('2', 'b')")
+      }
+
+      // table3 declares the collation on the column itself, so only the table1 side of
+      // the second join condition carries an inline COLLATE.
+      sql("CREATE TABLE table3 (col1 STRING COLLATE UTF8_LCASE_RTRIM) USING PARQUET")
+      sql("INSERT INTO table3 VALUES ('a'), ('b')")
+
+      val df = sql(
+        """SELECT t1.col1 COLLATE UTF8_LCASE_RTRIM AS result
+          |FROM table1 t1
+          |INNER JOIN table2 t2 ON t2.id = t1.id
+          |INNER JOIN table3 t3 ON t3.col1 = t1.col1 COLLATE UTF8_LCASE_RTRIM
+          |""".stripMargin
+      )
+
+      checkAnswer(df, Seq(Row("a"), Row("b")))
+
+      val queryPlan = df.queryExecution.executedPlan
+      val nestedLoopJoin = collectFirst(queryPlan) {
+        case join: BroadcastNestedLoopJoinExec => join
+      }
+      // Include the plan in the failure message so a regression is diagnosable from the log.
+      assert(nestedLoopJoin.isEmpty,
+        s"Expected no BroadcastNestedLoopJoinExec in the executed plan, but found:\n$queryPlan")
+    }
+  }
+
   test("hll sketch aggregate should respect collation") {
     case class HllSketchAggTestCase[R](c: String, result: R)
     val testCases = Seq(