[SPARK-29600][SQL] ArrayContains function may return incorrect result for DecimalType

amanomer · cloud-fan · commit 297f406425d4 · 2019-12-18T01:30:28.000+08:00
### What changes were proposed in this pull request? Use `TypeCoercion.findWiderTypeForTwo()` instead of `TypeCoercion.findTightestCommonType()` while preprocessing `inputTypes` in `ArrayContains`. ### Why are the changes needed? `TypeCoercion.findWiderTypeForTwo()` also handles cases for DecimalType. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Test cases to be added. Closes #26811 from amanomer/29600. Authored-by: Aman Omer <amanomer1996@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
@@ -243,7 +243,7 @@ object TypeCoercion {
    * string. If the wider decimal type exceeds system limitation, this rule will truncate
    * the decimal type before return it.
    */
-  private[analysis] def findWiderTypeWithoutStringPromotionForTwo(
+  private[catalyst] def findWiderTypeWithoutStringPromotionForTwo(
       t1: DataType,
       t2: DataType): Option[DataType] = {
     findTightestCommonType(t1, t2)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -1081,7 +1081,7 @@ case class ArrayContains(left: Expression, right: Expression)
     (left.dataType, right.dataType) match {
       case (_, NullType) => Seq.empty
       case (ArrayType(e1, hasNull), e2) =>
-        TypeCoercion.findTightestCommonType(e1, e2) match {
+        TypeCoercion.findWiderTypeWithoutStringPromotionForTwo(e1, e2) match {
           case Some(dt) => Seq(ArrayType(dt, hasNull), dt)
           case _ => Seq.empty
         }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -850,7 +850,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     val errorMsg1 =
       s"""
          |Input to function array_contains should have been array followed by a
-         |value with same element type, but it's [array<int>, decimal(29,29)].
+         |value with same element type, but it's [array<int>, decimal(38,29)].
        """.stripMargin.replace("\n", " ").trim()
     assert(e1.message.contains(errorMsg1))
 
@@ -865,6 +865,23 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     assert(e2.message.contains(errorMsg2))
   }
 
+  test("SPARK-29600: ArrayContains function may return incorrect result for DecimalType") {
+    checkAnswer(
+      sql("select array_contains(array(1.10), 1.1)"),
+      Seq(Row(true))
+    )
+
+    checkAnswer(
+      sql("SELECT array_contains(array(1.1), 1.10)"),
+      Seq(Row(true))
+    )
+
+    checkAnswer(
+      sql("SELECT array_contains(array(1.11), 1.1)"),
+      Seq(Row(false))
+    )
+  }
+
   test("arrays_overlap function") {
     val df = Seq(
       (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))),

Original file line number	Diff line number	Diff line change
`@@ -1081,7 +1081,7 @@ case class ArrayContains(left: Expression, right: Expression)`
`1081`	`1081`	`(left.dataType, right.dataType) match {`
`1082`	`1082`	`case (_, NullType) => Seq.empty`
`1083`	`1083`	`case (ArrayType(e1, hasNull), e2) =>`
`1084`		`- TypeCoercion.findTightestCommonType(e1, e2) match {`
	`1084`	`+ TypeCoercion.findWiderTypeWithoutStringPromotionForTwo(e1, e2) match {`
`1085`	`1085`	`case Some(dt) => Seq(ArrayType(dt, hasNull), dt)`
`1086`	`1086`	`case _ => Seq.empty`
`1087`	`1087`	`}`