@@ -1010,9 +1010,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
.orElse(Some("org.apache.hadoop.mapred.TextInputFormat")),
outputFormat = defaultHiveSerde.flatMap(_.outputFormat)
.orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")),
- // Note: Keep this unspecified because we use the presence of the serde to decide
- // whether to convert a table created by CTAS to a datasource table.
- serde = None,
+ serde = defaultHiveSerde.flatMap(_.serde),
compressed = false,
properties = Map())
}
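The comment removed above describes the earlier behavior: the serde was deliberately left unset so that the CTAS-conversion rule could key off its absence. The sketch below is an illustration only, not the actual Spark rule; once the parser always fills in a default serde, absence of a serde alone can no longer be the signal, which is what the CTAS test further down exercises.

object CtasConversionSketch {
  // Illustration only (not Spark's real code): per the removed comment, with
  // spark.sql.hive.convertCTAS enabled, a CTAS whose storage carried no serde was
  // converted into a data source table; a present serde kept it a Hive-serde table.
  def shouldConvertToDataSource(convertCTAS: Boolean, serde: Option[String]): Boolean =
    convertCTAS && serde.isEmpty

  def main(args: Array[String]): Unit = {
    println(shouldConvertToDataSource(convertCTAS = true, serde = None))                       // true
    println(shouldConvertToDataSource(convertCTAS = true, serde = Some("LazySimpleSerDe")))    // false
  }
}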
@@ -30,10 +30,12 @@ import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Generate, ScriptTransformation}
import org.apache.spark.sql.execution.command._
import org.apache.spark.sql.execution.datasources.CreateTable
- import org.apache.spark.sql.hive.test.TestHive
+ import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

- class HiveDDLCommandSuite extends PlanTest {
+ class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingleton {
val parser = TestHive.sessionState.sqlParser

private def extractTableDesc(sql: String): (CatalogTable, Boolean) = {
@@ -556,4 +558,24 @@ class HiveDDLCommandSuite extends PlanTest {
assert(partition2.get.apply("c") == "1" && partition2.get.apply("d") == "2")
}

test("Test the default fileformat for Hive-serde tables") {
withSQLConf("hive.default.fileformat" -> "orc") {
val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)")
assert(exists)
assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"))
assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"))
assert(desc.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde"))
}

withSQLConf("hive.default.fileformat" -> "parquet") {
val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)")
assert(exists)
val input = desc.storage.inputFormat
val output = desc.storage.outputFormat
val serde = desc.storage.serde
assert(input == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"))
assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"))
assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"))
}
}
}
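As a usage-level illustration of what the test above asserts, a spark-shell session along these lines should show the ORC formats and serde being picked up for a plain CREATE TABLE. This snippet is hypothetical (it assumes a Hive-enabled SparkSession named `spark` and a demo table name), not part of the PR.

// Hypothetical spark-shell snippet; table name and session are assumptions.
spark.sql("SET hive.default.fileformat=orc")
spark.sql("CREATE TABLE IF NOT EXISTS fileformat_demo (id INT)")
// DESCRIBE FORMATTED should list OrcInputFormat / OrcOutputFormat and OrcSerde,
// mirroring the assertions in the test above.
spark.sql("DESCRIBE FORMATTED fileformat_demo").show(100, truncate = false)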
@@ -492,7 +492,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

def checkRelation(
tableName: String,
- isDataSourceParquet: Boolean,
+ isDataSourceTable: Boolean,
format: String,
userSpecifiedLocation: Option[String] = None): Unit = {
val relation = EliminateSubqueryAliases(
@@ -501,7 +501,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
sessionState.catalog.getTableMetadata(TableIdentifier(tableName))
relation match {
case LogicalRelation(r: HadoopFsRelation, _, _) =>
- if (!isDataSourceParquet) {
+ if (!isDataSourceTable) {
fail(
s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " +
s"${HadoopFsRelation.getClass.getCanonicalName}.")
@@ -514,7 +514,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
assert(catalogTable.provider.get === format)

case r: MetastoreRelation =>
- if (isDataSourceParquet) {
+ if (isDataSourceTable) {
fail(
s"${HadoopFsRelation.getClass.getCanonicalName} is expected, but found " +
s"${classOf[MetastoreRelation].getCanonicalName}.")
@@ -524,8 +524,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
assert(r.catalogTable.storage.locationUri.get === location)
case None => // OK.
}
- // Also make sure that the format is the desired format.
+ // Also make sure that the format and serde are as desired.
assert(catalogTable.storage.inputFormat.get.toLowerCase.contains(format))
assert(catalogTable.storage.outputFormat.get.toLowerCase.contains(format))
val serde = catalogTable.storage.serde.get
format match {
case "sequence" | "text" => assert(serde.contains("LazySimpleSerDe"))
case "rcfile" => assert(serde.contains("LazyBinaryColumnarSerDe"))
case _ => assert(serde.toLowerCase.contains(format))
}
}

// When a user-specified location is defined, the table type needs to be EXTERNAL.
@@ -587,6 +594,30 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
}
}

test("CTAS with default fileformat") {
val table = "ctas1"
val ctas = s"CREATE TABLE IF NOT EXISTS $table SELECT key k, value FROM src"
withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") {
withSQLConf("hive.default.fileformat" -> "textfile") {
withTable(table) {
sql(ctas)
// We should get parquet here, as that is the default data source file format,
// controlled by the `spark.sql.sources.default` configuration. This test case
// verifies that setting `hive.default.fileformat` has no impact on the target
// table's file format for CTAS.
assert(sessionState.conf.defaultDataSourceName === "parquet")
checkRelation(tableName = table, isDataSourceTable = true, format = "parquet")
Contributor Author (@dilipbiswal):
In Scala 2.10, we need to name all the fields when we use named parameters. This is the reason for the build failure when building with Scala 2.10.

Member:
Hmm, I think there is an error because, in your mix of named and positional arguments, the positional arguments are not a prefix of the argument list. I.e., the compilation error in

checkRelation(table, isDataSourceTable = true, "parquet")

should be fixed by

checkRelation(table, isDataSourceTable = true, format = "parquet")

Contributor Author (@dilipbiswal, Oct 15, 2016):
@viirya So what I have should work, right? I have also named the first field for code readability, even though it is not strictly necessary to fix the compilation issue. I also had a question: do you know how to trigger a test run for Scala 2.10? I would like to run against 2.10 if possible. I have run it against my local environment, though.

Member:
As far as I know, we can't trigger it. Maybe @yhuai will know? You can compile it with Scala 2.10 locally to make sure it passes.

Member:
Yeah, I think it is no problem to have all named arguments for them.

Contributor:
I think we cannot trigger a Scala 2.10 build for a PR.

(A minimal sketch of this named-versus-positional argument rule follows the test below.)
}
}
withSQLConf("spark.sql.sources.default" -> "orc") {
withTable(table) {
sql(ctas)
checkRelation(tableName = table, isDataSourceTable = true, format = "orc")
}
}
}
}
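The named-versus-positional rule from the review thread above, as a minimal self-contained sketch. The checkRelation signature mirrors the one earlier in this diff; which calls compile on Scala 2.10 versus 2.11 is taken from the reviewers' statements and has not been re-verified here.

object NamedArgumentsSketch {
  // Signature mirroring checkRelation in SQLQuerySuite above.
  def checkRelation(
      tableName: String,
      isDataSourceTable: Boolean,
      format: String,
      userSpecifiedLocation: Option[String] = None): Unit = ()

  def main(args: Array[String]): Unit = {
    // A positional argument after a named one -- the form the reviewers say broke the
    // Scala 2.10 build (left commented out so this sketch compiles everywhere):
    // checkRelation("ctas1", isDataSourceTable = true, "parquet")

    // Naming every argument, as the PR does, is accepted by both 2.10 and 2.11:
    checkRelation(tableName = "ctas1", isDataSourceTable = true, format = "parquet")
  }
}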

test("CTAS without serde with location") {
withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") {
withTempDir { dir =>