[SPARK-21203][SQL] Fix wrong results of insertion of Array of Struct

gatorsmile · cloud-fan · commit ad44ab5cb9cd · 2017-06-24T22:36:14.000+08:00
### What changes were proposed in this pull request? ```SQL CREATE TABLE `tab1` (`custom_fields` ARRAY<STRUCT<`id`: BIGINT, `value`: STRING>>) USING parquet INSERT INTO `tab1` SELECT ARRAY(named_struct('id', 1, 'value', 'a'), named_struct('id', 2, 'value', 'b')) SELECT custom_fields.id, custom_fields.value FROM tab1 ``` The above query always return the last struct of the array, because the rule `SimplifyCasts` incorrectly rewrites the query. The underlying cause is we always use the same `GenericInternalRow` object when doing the cast. ### How was this patch tested? Author: gatorsmile <gatorsmile@gmail.com> Closes #18412 from gatorsmile/castStruct. (cherry picked from commit 2e1586f) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -482,15 +482,15 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
       case (fromField, toField) => cast(fromField.dataType, toField.dataType)
     }
     // TODO: Could be faster?
-    val newRow = new GenericInternalRow(from.fields.length)
     buildCast[InternalRow](_, row => {
+      val newRow = new GenericInternalRow(from.fields.length)
       var i = 0
       while (i < row.numFields) {
         newRow.update(i,
           if (row.isNullAt(i)) null else castFuncs(i)(row.get(i, from.apply(i).dataType)))
         i += 1
       }
-      newRow.copy()
+      newRow
     })
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -345,4 +345,25 @@ class InsertSuite extends DataSourceTest with SharedSQLContext {
       )
     }
   }
+
+  test("SPARK-21203 wrong results of insertion of Array of Struct") {
+    val tabName = "tab1"
+    withTable(tabName) {
+      spark.sql(
+        """
+          |CREATE TABLE `tab1`
+          |(`custom_fields` ARRAY<STRUCT<`id`: BIGINT, `value`: STRING>>)
+          |USING parquet
+        """.stripMargin)
+      spark.sql(
+        """
+          |INSERT INTO `tab1`
+          |SELECT ARRAY(named_struct('id', 1, 'value', 'a'), named_struct('id', 2, 'value', 'b'))
+        """.stripMargin)
+
+      checkAnswer(
+        spark.sql("SELECT custom_fields.id, custom_fields.value FROM tab1"),
+        Row(Array(1, 2), Array("a", "b")))
+    }
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -482,15 +482,15 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String`
`482`	`482`	`case (fromField, toField) => cast(fromField.dataType, toField.dataType)`
`483`	`483`	`}`
`484`	`484`	`// TODO: Could be faster?`
`485`		`- val newRow = new GenericInternalRow(from.fields.length)`
`486`	`485`	`buildCast[InternalRow](_, row => {`
	`486`	`+ val newRow = new GenericInternalRow(from.fields.length)`
`487`	`487`	`var i = 0`
`488`	`488`	`while (i < row.numFields) {`
`489`	`489`	`newRow.update(i,`
`490`	`490`	`if (row.isNullAt(i)) null else castFuncs(i)(row.get(i, from.apply(i).dataType)))`
`491`	`491`	`i += 1`
`492`	`492`	`}`
`493`		`- newRow.copy()`
	`493`	`+ newRow`
`494`	`494`	`})`
`495`	`495`	`}`
`496`	`496`