Skip to content

Commit 550d0e7

Browse files
gatorsmile
authored and yhuai committed
[SPARK-16482][SQL] Describe Table Command for Tables Requiring Runtime Inferred Schema
#### What changes were proposed in this pull request? If we create a table pointing to a parquet/json datasets without specifying the schema, describe table command does not show the schema at all. It only shows `# Schema of this table is inferred at runtime`. In 1.6, describe table does show the schema of such a table. ~~For data source tables, to infer the schema, we need to load the data source tables at runtime. Thus, this PR calls the function `lookupRelation`.~~ For data source tables, we infer the schema before table creation. Thus, this PR set the inferred schema as the table schema when table creation. #### How was this patch tested? Added test cases Author: gatorsmile <gatorsmile@gmail.com> Closes #14148 from gatorsmile/describeSchema. (cherry picked from commit c5ec879) Signed-off-by: Yin Huai <yhuai@databricks.com>
1 parent 9e3a598 commit 550d0e7

File tree

2 files changed

+22
-22
lines changed

2 files changed

+22
-22
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -413,38 +413,36 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF
413413
} else {
414414
val metadata = catalog.getTableMetadata(table)
415415

416+
if (DDLUtils.isDatasourceTable(metadata)) {
417+
DDLUtils.getSchemaFromTableProperties(metadata) match {
418+
case Some(userSpecifiedSchema) => describeSchema(userSpecifiedSchema, result)
419+
case None => describeSchema(catalog.lookupRelation(table).schema, result)
420+
}
421+
} else {
422+
describeSchema(metadata.schema, result)
423+
}
424+
416425
if (isExtended) {
417426
describeExtended(metadata, result)
418427
} else if (isFormatted) {
419428
describeFormatted(metadata, result)
420429
} else {
421-
describe(metadata, result)
430+
describePartitionInfo(metadata, result)
422431
}
423432
}
424433

425434
result
426435
}
427436

428-
// Shows data columns and partitioned columns (if any)
429-
private def describe(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = {
437+
private def describePartitionInfo(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = {
430438
if (DDLUtils.isDatasourceTable(table)) {
431-
val schema = DDLUtils.getSchemaFromTableProperties(table)
432-
433-
if (schema.isEmpty) {
434-
append(buffer, "# Schema of this table is inferred at runtime", "", "")
435-
} else {
436-
schema.foreach(describeSchema(_, buffer))
437-
}
438-
439439
val partCols = DDLUtils.getPartitionColumnsFromTableProperties(table)
440440
if (partCols.nonEmpty) {
441441
append(buffer, "# Partition Information", "", "")
442442
append(buffer, s"# ${output.head.name}", "", "")
443443
partCols.foreach(col => append(buffer, col, "", ""))
444444
}
445445
} else {
446-
describeSchema(table.schema, buffer)
447-
448446
if (table.partitionColumns.nonEmpty) {
449447
append(buffer, "# Partition Information", "", "")
450448
append(buffer, s"# ${output.head.name}", output(1).name, output(2).name)
@@ -454,14 +452,14 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF
454452
}
455453

456454
private def describeExtended(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = {
457-
describe(table, buffer)
455+
describePartitionInfo(table, buffer)
458456

459457
append(buffer, "", "", "")
460458
append(buffer, "# Detailed Table Information", table.toString, "")
461459
}
462460

463461
private def describeFormatted(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = {
464-
describe(table, buffer)
462+
describePartitionInfo(table, buffer)
465463

466464
append(buffer, "", "", "")
467465
append(buffer, "# Detailed Table Information", "", "")

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -612,15 +612,17 @@ class HiveDDLSuite
612612
}
613613

614614
test("desc table for data source table - no user-defined schema") {
615-
withTable("t1") {
616-
withTempPath { dir =>
617-
val path = dir.getCanonicalPath
618-
spark.range(1).write.parquet(path)
619-
sql(s"CREATE TABLE t1 USING parquet OPTIONS (PATH '$path')")
615+
Seq("parquet", "json", "orc").foreach { fileFormat =>
616+
withTable("t1") {
617+
withTempPath { dir =>
618+
val path = dir.getCanonicalPath
619+
spark.range(1).write.format(fileFormat).save(path)
620+
sql(s"CREATE TABLE t1 USING $fileFormat OPTIONS (PATH '$path')")
620621

621-
val desc = sql("DESC FORMATTED t1").collect().toSeq
622+
val desc = sql("DESC FORMATTED t1").collect().toSeq
622623

623-
assert(desc.contains(Row("# Schema of this table is inferred at runtime", "", "")))
624+
assert(desc.contains(Row("id", "bigint", "")))
625+
}
624626
}
625627
}
626628
}

0 commit comments

Comments (0)