From 0ebb0142e13db3ce8fb474ee5682528b0f87d2d2 Mon Sep 17 00:00:00 2001 From: xin Wu Date: Fri, 1 Apr 2016 18:46:16 -0700 Subject: [PATCH 001/109] show create table DDL -- hive metastore table --- .../spark/sql/catalyst/parser/SqlBase.g4 | 1 + .../sql/catalyst/catalog/SessionCatalog.scala | 8 ++ .../sql/catalyst/catalog/interface.scala | 1 + .../spark/sql/execution/SparkSqlParser.scala | 9 ++ .../sql/execution/command/commands.scala | 27 ++++ .../spark/sql/hive/HiveSessionCatalog.scala | 132 +++++++++++++++++- .../sql/hive/client/HiveClientImpl.scala | 1 + 7 files changed, 178 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 85cb585919da..7930f1e7709f 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -118,6 +118,7 @@ statement | SHOW DATABASES (LIKE pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties + | SHOW CREATE TABLE tableIdentifier | SHOW FUNCTIONS (LIKE? (qualifiedName | pattern=STRING))? #showFunctions | (DESC | DESCRIBE) FUNCTION EXTENDED? qualifiedName #describeFunction | (DESC | DESCRIBE) option=(EXTENDED | FORMATTED)? diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 7db9fd0527ec..283f98ebbc02 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -175,6 +175,14 @@ class SessionCatalog( externalCatalog.getTable(db, table) } + /** + * Generate Create table DDL string for the specified tableIdentifier + */ + def generateTableDDL(name: TableIdentifier): String = { + val catalogTable = this.getTable(name) + "CREATE TABLE " + name.toString + } + // ------------------------------------------------------------- // | Methods that interact with temporary and metastore tables | // ------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index e29d6bd8b09e..e48f837b5fde 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -219,6 +219,7 @@ case class CatalogTable( storage: CatalogStorageFormat, schema: Seq[CatalogColumn], partitionColumns: Seq[CatalogColumn] = Seq.empty, + bucketColumns: Seq[String] = Seq.empty, sortColumns: Seq[CatalogColumn] = Seq.empty, numBuckets: Int = 0, createTime: Long = System.currentTimeMillis, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 3de8aa02766d..2403d68b5f52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -108,6 +108,15 @@ class SparkSqlAstBuilder extends AstBuilder { Option(ctx.key).map(visitTablePropertyKey)) } + /** + * Create a [[ShowCreateTableCommand]] logical 
plan + */ + override def visitShowCreateTable(ctx: ShowCreateTableContext): LogicalPlan = withOrigin(ctx) { + ShowCreateTableCommand( + ctx.tableIdentifier.table.getText, + Option(ctx.tableIdentifier.db).map(_.getText)) + } + /** * Create a [[RefreshTable]] logical plan. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 3fd2a93d2926..cd1f88359e0b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -422,6 +422,33 @@ case class ShowTablePropertiesCommand( } } +/** + * A command for users to get the DDL of an existing table + * The syntax of using this command in SQL is: + * {{{ + * SHOW CREATE TABLE tableIdentifier + * }}} + */ +case class ShowCreateTableCommand( + tableName: String, + databaseName: Option[String]) + extends RunnableCommand{ + + // The result of SHOW CREATE TABLE is the whole string of DDL command + override val output: Seq[Attribute] = { + val schema = StructType( + StructField("DDL", StringType, nullable = false):: Nil) + schema.toAttributes + } + + override def run(sqlContext: SQLContext): Seq[Row] ={ + val catalog = sqlContext.sessionState.catalog + val db = databaseName.getOrElse(catalog.getCurrentDatabase) + Seq(Row(catalog.generateTableDDL(TableIdentifier(tableName, databaseName)))) + } + +} + /** * A command for users to list all of the registered functions. * The syntax of using this command in SQL is: diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 0cccc22e5a62..9759cdf11ee7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.hive.ql.udf.generic.{AbstractGenericUDAFResolver, Gener import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder -import org.apache.spark.sql.catalyst.catalog.{FunctionResourceLoader, SessionCatalog} +import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, FunctionResourceLoader, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.catalyst.rules.Rule @@ -241,6 +241,136 @@ private[sql] class HiveSessionCatalog( val info = new ExpressionInfo(clazz.getCanonicalName, functionName) createTempFunction(functionName, info, builder, ignoreIfExists = false) } + + /** + * Generate Create table DDL string for the specified tableIdentifier + * that is from Hive metastore + */ + override def generateTableDDL(name: TableIdentifier): String = { + val ct = this.getTable(name) + val sb = new StringBuilder("CREATE ") + val processedProperties = scala.collection.mutable.ArrayBuffer.empty[String] + + if (ct.tableType == CatalogTableType.VIRTUAL_VIEW) { + sb.append(" VIEW "+ct.qualifiedName+" AS " + ct.viewOriginalText.getOrElse("")) + } else { + // TEMPORARY keyword is not applicable for HIVE DDL from Spark SQL yet. 
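+      // An external Hive table keeps its data at a user-supplied location, so the generated
+      // DDL must carry the EXTERNAL keyword (e.g. "CREATE EXTERNAL TABLE db.t ..."); managed
+      // tables fall through to a plain "CREATE TABLE db.t ..." below.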
+ if (ct.tableType == CatalogTableType.EXTERNAL_TABLE) { + processedProperties += "EXTERNAL" + sb.append(" EXTERNAL TABLE " + ct.qualifiedName) + } else { + sb.append(" TABLE " + ct.qualifiedName) + } + // column list + val cols = ct.schema map { col => + col.name + " " + col.dataType + (col.comment.getOrElse("") match { + case cmt: String if cmt.length > 0 => " COMMENT '" + escapeHiveCommand(cmt) + "'" + case _ => "" + }) + // hive ddl does not honor NOT NULL, it is always default to be nullable + } + sb.append(cols.mkString("(", ", ", ")")+"\n") + + // table comment + sb.append(" " + + ct.properties.getOrElse("comment", new String) match { + case tcmt: String if tcmt.trim.length > 0 => + processedProperties += "comment" + " COMMENT '" + escapeHiveCommand(tcmt.trim) + "'\n" + case _ => "" + }) + + // partitions + val partCols = ct.partitionColumns map { col => + col.name + " " + col.dataType + (col.comment.getOrElse("") match { + case cmt: String if cmt.length > 0 => " COMMENT '" + escapeHiveCommand(cmt) + "'" + case _ => "" + }) + } + if (partCols != null && partCols.size > 0) { + sb.append(" PARTITIONED BY ") + sb.append(partCols.mkString("( ", ", ", " )")+"\n") + } + + // sort bucket + if (ct.bucketColumns.size > 0) { + processedProperties += "SORTBUCKETCOLSPREFIX" + sb.append(" CLUSTERED BY ") + sb.append(ct.bucketColumns.mkString("( ", ", ", " )")) + + // TODO sort columns don't have the the right scala types yet. need to adapt to Hive Order + if (ct.sortColumns.size > 0) { + sb.append(" SORTED BY ") + sb.append(ct.sortColumns.map(_.name).mkString("( ", ", ", " )")) + } + sb.append(" INTO " + ct.numBuckets + " BUCKETS\n") + } + + // TODO CatalogTable does not implement skew spec yet + // skew spec + // TODO StorageHandler case is not handled yet, since CatalogTable does not have it yet + // row format + sb.append(" ROW FORMAT ") + + val serdeProps = ct.storage.serdeProperties + val delimiterPrefixes = + Seq("FIELDS TERMINATED BY", + "COLLECTION ITEMS TERMINATED BY", + "MAP KEYS TERMINATED BY", + "LINES TERMINATED BY", + "NULL DEFINED AS") + + val delimiters = Seq( + serdeProps.get("field.delim"), + serdeProps.get("colelction.delim"), + serdeProps.get("mapkey.delim"), + serdeProps.get("line.delim"), + serdeProps.get("serialization.null.format")).zipWithIndex + + val delimiterStrs = delimiters collect { + case (Some(ch), i) => + delimiterPrefixes(i) + " '" + + escapeHiveCommand(ch) + + "' " + } + if (delimiterStrs.size > 0){ + sb.append("DELIMITED ") + sb.append(delimiterStrs.mkString(" ")+"\n") + }else{ + sb.append("SERDE '") + sb.append(escapeHiveCommand(ct.storage.serde.getOrElse(""))+ "' \n") + } + + sb.append("STORED AS INPUTFORMAT '" + + escapeHiveCommand(ct.storage.inputFormat.getOrElse("")) + "' \n") + sb.append("OUTPUTFORMAT '" + + escapeHiveCommand(ct.storage.outputFormat.getOrElse(""))+"' \n") + + // table location + sb.append("LOCATION '" + + escapeHiveCommand(ct.storage.locationUri.getOrElse(""))+"' \n") + + // table properties + val propertPairs = ct.properties collect { + case (k, v) if !processedProperties.contains(k) => + "'" + escapeHiveCommand(k) + "'='"+escapeHiveCommand(v)+"'" + } + if(propertPairs.size>0) + sb.append("TBLPROPERTIES " + propertPairs.mkString("( ", ", \n", " )")+"\n") + + } + sb.toString() + } + + private def escapeHiveCommand(str: String): String = { + str.map{c => + if (c == '\'' || c == ';'){ + '\\' + } else { + c + } + } + } } private[sql] object HiveSessionCatalog { diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index d0eb9ddf50ae..b8bfea8efd73 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -309,6 +309,7 @@ private[hive] class HiveClientImpl( }, schema = h.getCols.asScala.map(fromHiveColumn), partitionColumns = h.getPartCols.asScala.map(fromHiveColumn), + bucketColumns = h.getTTable.getSd.getBucketCols.asScala.toSeq, sortColumns = Seq(), numBuckets = h.getNumBuckets, createTime = h.getTTable.getCreateTime.toLong * 1000, From 6d060be797d4127f0b86fa59c1bc848d75215533 Mon Sep 17 00:00:00 2001 From: xin Wu Date: Fri, 1 Apr 2016 23:01:46 -0700 Subject: [PATCH 002/109] update upon review --- .../spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../sql/execution/command/commands.scala | 2 +- .../sql/hive/execution/HiveShowDDLSuite.scala | 144 ++++++++++++++++++ 3 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 7930f1e7709f..ce1aa3773d6c 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -118,7 +118,7 @@ statement | SHOW DATABASES (LIKE pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties - | SHOW CREATE TABLE tableIdentifier + | SHOW CREATE TABLE tableIdentifier #showCreateTable | SHOW FUNCTIONS (LIKE? (qualifiedName | pattern=STRING))? #showFunctions | (DESC | DESCRIBE) FUNCTION EXTENDED? qualifiedName #describeFunction | (DESC | DESCRIBE) option=(EXTENDED | FORMATTED)? diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index cd1f88359e0b..825dc3f7b3b7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -437,7 +437,7 @@ case class ShowCreateTableCommand( // The result of SHOW CREATE TABLE is the whole string of DDL command override val output: Seq[Attribute] = { val schema = StructType( - StructField("DDL", StringType, nullable = false):: Nil) + StructField("Result", StringType, nullable = false):: Nil) schema.toAttributes } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala new file mode 100644 index 000000000000..fc089ae272e3 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala @@ -0,0 +1,144 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.hive.execution + +import org.apache.spark.sql.{AnalysisException, Row, QueryTest} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils + +/** + * Created by xwu0226 on 4/1/16. + */ +class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + test("show create table - hive table - no row format"){ + withTable("t1"){ + sql( + """ + |create table t1(c1 int, c2 string) + |stored as parquet + |location 'file:///home/xwu0226/spark-test/data/t' + """.stripMargin) + sql("show create table t1").show(false) + } + } + + test("show create table - hive non-external table"){ + withTable("t1"){ + sql( + """ + |create table t1(c1 int COMMENT 'abc', c2 string) COMMENT 'my table' + |row format delimited fields terminated by ',' + |stored as parquet + |location 'file:///home/xwu0226/spark-test/data/t' + """.stripMargin) + sql("show create table t1").show(false) + } + } + + test("show create table - hive external table"){ + withTable("t1"){ + sql( + """ + |create external table t1(c1 int, c2 string) + |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location 'file:///home/xwu0226/spark-test/data/t' + |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', + |'my.property.three'='2', 'my.property.four'='false') + """.stripMargin) + sql("show create table t1").show(false) + } + } + + test("show create table - hive table - cluster bucket and skew"){ + withTable("t1"){ + sql( + """ + |create external table t1(c1 int COMMENT 'first column', c2 string) + |COMMENT 'xin\'s table' + |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) + |CLUSTERED BY (c1, c2) SORTED BY (c1 ASC, C2 DESC) INTO 5 BUCKETS + |row format delimited fields terminated by ',' + |COLLECTION ITEMS TERMINATED BY ',' + |MAP KEYS TERMINATED BY ',' + |NULL DEFINED AS '\N' + |stored as parquet + |location 'file:///home/xwu0226/spark-test/data/t' + |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', + |'my.property.three'='2', 'my.property.four'='false') + """.stripMargin) + sql("show create table t1").show(false) + } + } + + test("show create table - hive temp table"){ + withTable("t1"){ + sql( + """ + |create TEMPORARY table t1(c1 int, c2 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location 'file:///home/xwu0226/spark-test/data/t' + """.stripMargin) + sql("show create table t1").show(false) + } + } + + test("show create table - hive TEMPORARY external table"){ + withTable("t1"){ + sql( + """ + |create TEMPORARY external table t1(c1 int, c2 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location 'file:///home/xwu0226/spark-test/data/t' + """.stripMargin) + sql("show create table t1").show(false) + } + } + + test("show create table - hive view"){ + withTable("t1"){ + withView("v1") { + sql( + """ + |create table t1(c1 int, c2 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location 
'file:///home/xwu0226/spark-test/data/t' + """.stripMargin) + sql( + """ + |create view v1 as select * from t1 + """.stripMargin) + sql("show create table v1").show(false) + } + } + } + + test("show create temp table"){ + withTable("t1"){ + sql( + """ + |create temporary table t1(c1 int, c2 string) + """.stripMargin) + sql("show create table t1").show(false) + } + } +} From 2799672162d715b209cad9a5c103d6f09692d8dc Mon Sep 17 00:00:00 2001 From: xin Wu Date: Sat, 2 Apr 2016 11:19:26 -0700 Subject: [PATCH 003/109] ignoring sqlContext temp table and considering datasource table ddl --- .../sql/catalyst/catalog/SessionCatalog.scala | 4 +- .../spark/sql/hive/HiveSessionCatalog.scala | 28 +++++++++-- .../sql/hive/execution/HiveShowDDLSuite.scala | 48 +++++++++++++++++-- 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 283f98ebbc02..d2d017cc4251 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -179,8 +179,8 @@ class SessionCatalog( * Generate Create table DDL string for the specified tableIdentifier */ def generateTableDDL(name: TableIdentifier): String = { - val catalogTable = this.getTable(name) - "CREATE TABLE " + name.toString + throw new AnalysisException( + "SHOW CREATE TABLE command is not supported for temporary tables created in SQLContext.") } // ------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 9759cdf11ee7..dd81f18dee76 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.hive.ql.udf.generic.{AbstractGenericUDAFResolver, Gener import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder -import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, FunctionResourceLoader, SessionCatalog} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, FunctionResourceLoader, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.catalyst.rules.Rule @@ -246,15 +246,13 @@ private[sql] class HiveSessionCatalog( * Generate Create table DDL string for the specified tableIdentifier * that is from Hive metastore */ - override def generateTableDDL(name: TableIdentifier): String = { - val ct = this.getTable(name) + override def generateHiveDDL(ct: CatalogTable): String = { val sb = new StringBuilder("CREATE ") val processedProperties = scala.collection.mutable.ArrayBuffer.empty[String] if (ct.tableType == CatalogTableType.VIRTUAL_VIEW) { sb.append(" VIEW "+ct.qualifiedName+" AS " + ct.viewOriginalText.getOrElse("")) } else { - // TEMPORARY keyword is not applicable for HIVE DDL from Spark SQL yet. 
if (ct.tableType == CatalogTableType.EXTERNAL_TABLE) { processedProperties += "EXTERNAL" sb.append(" EXTERNAL TABLE " + ct.qualifiedName) @@ -362,6 +360,28 @@ private[sql] class HiveSessionCatalog( sb.toString() } + private def generateDataSourceDDL(ct: CatalogTable): String = { + val sb = new StringBuilder("CREATE TABLE " + ct.qualifiedName) + // TODO will continue on generating Datasource syntax DDL + // will remove generateHiveDDL once it is done. + generateHiveDDL(ct) + } + + /** + * Generate Create table DDL string for the specified tableIdentifier + * that is from Hive metastore + */ + override def generateTableDDL(name: TableIdentifier): String = { + val ct = this.getTable(name) + if(ct.properties.get("spark.sql.sources.provider") == None){ + // CREATE [TEMPORARY] TABLE ... ROW FORMAT.. TBLPROPERTIES (...) + generateHiveDDL(ct) + }else{ + // CREATE [TEMPORARY] TABLE .... USING .... OPTIONS (...) + generateDataSourceDDL(ct) + } + } + private def escapeHiveCommand(str: String): String = { str.map{c => if (c == '\'' || c == ';'){ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala index fc089ae272e3..d7130f3a50b4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala @@ -20,10 +20,8 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.{AnalysisException, Row, QueryTest} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.Utils -/** - * Created by xwu0226 on 4/1/16. - */ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("show create table - hive table - no row format"){ withTable("t1"){ @@ -141,4 +139,48 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("show create table t1").show(false) } } + + test("show create table -- datasource table"){ + withTable("t_datasource"){ + sql("select 1, 'abc'").write.saveAsTable("t_datasource") + sql("show create table t_datasource").show(false) + } + } + + test("show create table -- partitioned"){ + val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile + withTable("t_datasource"){ + val df = sqlContext.read.json(jsonFilePath) + df.write.format("json").saveAsTable("t_datasource") + sql("show create table t_datasource").show(false) + } + } + + test("show create table -- USING and OPTIONS") { + val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile + withTable("jsonTable") { + sql( + s"""CREATE TABLE jsonTable + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS ( + | path '$jsonFilePath' + |) + """.stripMargin) + sql("show create table jsonTable").show(false) + } + } + + test("show create table -- USING and OPTIONS with column definition") { + val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile + withTable("jsonTable") { + sql( + s"""CREATE TABLE jsonTable (c1 string, c2 string, c3 int) + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS ( + | path '$jsonFilePath' + |) + """.stripMargin) + sql("show create table jsonTable").show(false) + } + } } From 98c020aa9a5374861d1470fa0c305148e8314ada Mon Sep 17 00:00:00 2001 From: xin Wu Date: Mon, 4 Apr 2016 14:54:32 -0700 Subject: [PATCH 004/109] fix scala style issue --- 
.../sql/execution/command/commands.scala | 2 +- .../spark/sql/hive/HiveSessionCatalog.scala | 32 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 825dc3f7b3b7..f8476033392f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -441,7 +441,7 @@ case class ShowCreateTableCommand( schema.toAttributes } - override def run(sqlContext: SQLContext): Seq[Row] ={ + override def run(sqlContext: SQLContext): Seq[Row] = { val catalog = sqlContext.sessionState.catalog val db = databaseName.getOrElse(catalog.getCurrentDatabase) Seq(Row(catalog.generateTableDDL(TableIdentifier(tableName, databaseName)))) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index dd81f18dee76..da8a4527e708 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -251,7 +251,7 @@ private[sql] class HiveSessionCatalog( val processedProperties = scala.collection.mutable.ArrayBuffer.empty[String] if (ct.tableType == CatalogTableType.VIRTUAL_VIEW) { - sb.append(" VIEW "+ct.qualifiedName+" AS " + ct.viewOriginalText.getOrElse("")) + sb.append(" VIEW " + ct.qualifiedName + " AS " + ct.viewOriginalText.getOrElse("")) } else { if (ct.tableType == CatalogTableType.EXTERNAL_TABLE) { processedProperties += "EXTERNAL" @@ -267,7 +267,7 @@ private[sql] class HiveSessionCatalog( }) // hive ddl does not honor NOT NULL, it is always default to be nullable } - sb.append(cols.mkString("(", ", ", ")")+"\n") + sb.append(cols.mkString("(", ", ", ")") + "\n") // table comment sb.append(" " + @@ -287,7 +287,7 @@ private[sql] class HiveSessionCatalog( } if (partCols != null && partCols.size > 0) { sb.append(" PARTITIONED BY ") - sb.append(partCols.mkString("( ", ", ", " )")+"\n") + sb.append(partCols.mkString("( ", ", ", " )") + "\n") } // sort bucket @@ -331,31 +331,31 @@ private[sql] class HiveSessionCatalog( escapeHiveCommand(ch) + "' " } - if (delimiterStrs.size > 0){ + if (delimiterStrs.size > 0) { sb.append("DELIMITED ") - sb.append(delimiterStrs.mkString(" ")+"\n") - }else{ + sb.append(delimiterStrs.mkString(" ") + "\n") + } else { sb.append("SERDE '") - sb.append(escapeHiveCommand(ct.storage.serde.getOrElse(""))+ "' \n") + sb.append(escapeHiveCommand(ct.storage.serde.getOrElse("")) + "' \n") } sb.append("STORED AS INPUTFORMAT '" + escapeHiveCommand(ct.storage.inputFormat.getOrElse("")) + "' \n") sb.append("OUTPUTFORMAT '" + - escapeHiveCommand(ct.storage.outputFormat.getOrElse(""))+"' \n") + escapeHiveCommand(ct.storage.outputFormat.getOrElse("")) + "' \n") // table location sb.append("LOCATION '" + - escapeHiveCommand(ct.storage.locationUri.getOrElse(""))+"' \n") + escapeHiveCommand(ct.storage.locationUri.getOrElse("")) + "' \n") // table properties val propertPairs = ct.properties collect { case (k, v) if !processedProperties.contains(k) => - "'" + escapeHiveCommand(k) + "'='"+escapeHiveCommand(v)+"'" + "'" + escapeHiveCommand(k) + "'='" + escapeHiveCommand(v) + "'" + } + if (propertPairs.size>0) { + sb.append("TBLPROPERTIES " + propertPairs.mkString("( ", ", \n", " )") + "\n") } - 
if(propertPairs.size>0) - sb.append("TBLPROPERTIES " + propertPairs.mkString("( ", ", \n", " )")+"\n") - } sb.toString() } @@ -373,10 +373,10 @@ private[sql] class HiveSessionCatalog( */ override def generateTableDDL(name: TableIdentifier): String = { val ct = this.getTable(name) - if(ct.properties.get("spark.sql.sources.provider") == None){ + if(ct.properties.get("spark.sql.sources.provider") == None) { // CREATE [TEMPORARY] TABLE ... ROW FORMAT.. TBLPROPERTIES (...) generateHiveDDL(ct) - }else{ + } else { // CREATE [TEMPORARY] TABLE .... USING .... OPTIONS (...) generateDataSourceDDL(ct) } @@ -384,7 +384,7 @@ private[sql] class HiveSessionCatalog( private def escapeHiveCommand(str: String): String = { str.map{c => - if (c == '\'' || c == ';'){ + if (c == '\'' || c == ';') { '\\' } else { c From efd889821bf84e328ef6dd8d0b6a645729248251 Mon Sep 17 00:00:00 2001 From: xin Wu Date: Mon, 4 Apr 2016 15:40:26 -0700 Subject: [PATCH 005/109] fix scala style issue in testcase --- .../sql/hive/execution/HiveShowDDLSuite.scala | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala index d7130f3a50b4..0466686ef8bb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala @@ -17,14 +17,14 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.{AnalysisException, Row, QueryTest} +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { - test("show create table - hive table - no row format"){ - withTable("t1"){ + test("show create table - hive table - no row format") { + withTable("t1") { sql( """ |create table t1(c1 int, c2 string) @@ -35,8 +35,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table - hive non-external table"){ - withTable("t1"){ + test("show create table - hive non-external table") { + withTable("t1") { sql( """ |create table t1(c1 int COMMENT 'abc', c2 string) COMMENT 'my table' @@ -48,8 +48,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table - hive external table"){ - withTable("t1"){ + test("show create table - hive external table") { + withTable("t1") { sql( """ |create external table t1(c1 int, c2 string) @@ -64,8 +64,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table - hive table - cluster bucket and skew"){ - withTable("t1"){ + test("show create table - hive table - cluster bucket and skew") { + withTable("t1") { sql( """ |create external table t1(c1 int COMMENT 'first column', c2 string) @@ -85,8 +85,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table - hive temp table"){ - withTable("t1"){ + test("show create table - hive temp table") { + withTable("t1") { sql( """ |create TEMPORARY table t1(c1 int, c2 string) @@ -98,8 +98,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table - hive TEMPORARY 
external table"){ - withTable("t1"){ + test("show create table - hive TEMPORARY external table") { + withTable("t1") { sql( """ |create TEMPORARY external table t1(c1 int, c2 string) @@ -111,8 +111,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table - hive view"){ - withTable("t1"){ + test("show create table - hive view") { + withTable("t1") { withView("v1") { sql( """ @@ -130,8 +130,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create temp table"){ - withTable("t1"){ + test("show create temp table") { + withTable("t1") { sql( """ |create temporary table t1(c1 int, c2 string) @@ -140,16 +140,16 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show create table -- datasource table"){ - withTable("t_datasource"){ + test("show create table -- datasource table") { + withTable("t_datasource") { sql("select 1, 'abc'").write.saveAsTable("t_datasource") sql("show create table t_datasource").show(false) } } - test("show create table -- partitioned"){ + test("show create table -- partitioned") { val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile - withTable("t_datasource"){ + withTable("t_datasource") { val df = sqlContext.read.json(jsonFilePath) df.write.format("json").saveAsTable("t_datasource") sql("show create table t_datasource").show(false) From b370630f5827071bc5076e9b3fa9c92720b27eb2 Mon Sep 17 00:00:00 2001 From: xin Wu Date: Mon, 4 Apr 2016 18:31:46 -0700 Subject: [PATCH 006/109] fix testcase for test failure --- .../sql/hive/execution/HiveQuerySuite.scala | 1 - .../sql/hive/execution/HiveShowDDLSuite.scala | 181 ++++++++++-------- 2 files changed, 99 insertions(+), 83 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 0c57ede9ed0a..8bcf8a4d991b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -1275,7 +1275,6 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { } test("some show commands are not supported") { - assertUnsupportedFeature { sql("SHOW CREATE TABLE my_table") } assertUnsupportedFeature { sql("SHOW COMPACTIONS") } assertUnsupportedFeature { sql("SHOW TRANSACTIONS") } assertUnsupportedFeature { sql("SHOW INDEXES ON my_table") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala index 0466686ef8bb..7e39daa47afc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala @@ -22,110 +22,125 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils -class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { +class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("show create table - hive table - no row format") { - withTable("t1") { - sql( - """ - |create table t1(c1 int, c2 string) - |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - """.stripMargin) - sql("show create table 
t1").show(false) + withTempDir { tmpDir => + withTable("t1") { + sql( + s""" + |create table t1(c1 int, c2 string) + |stored as parquet + |location '${tmpDir}' + """.stripMargin) + sql("show create table t1").show(false) + } } } test("show create table - hive non-external table") { - withTable("t1") { - sql( - """ - |create table t1(c1 int COMMENT 'abc', c2 string) COMMENT 'my table' - |row format delimited fields terminated by ',' - |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - """.stripMargin) - sql("show create table t1").show(false) + withTempDir { tmpDir => + withTable("t1") { + sql( + s""" + |create table t1(c1 int COMMENT 'abc', c2 string) COMMENT 'my table' + |row format delimited fields terminated by ',' + |stored as parquet + |location '${tmpDir}' + """.stripMargin) + sql("show create table t1").show(false) + } } } test("show create table - hive external table") { - withTable("t1") { - sql( - """ - |create external table t1(c1 int, c2 string) - |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) - |row format delimited fields terminated by ',' - |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', - |'my.property.three'='2', 'my.property.four'='false') - """.stripMargin) - sql("show create table t1").show(false) + withTempDir { tmpDir => + withTable("t1") { + sql( + s""" + |create external table t1(c1 int, c2 string) + |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location '${tmpDir}' + |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', + |'my.property.three'='2', 'my.property.four'='false') + """.stripMargin) + sql("show create table t1").show(false) + } } } test("show create table - hive table - cluster bucket and skew") { - withTable("t1") { - sql( - """ - |create external table t1(c1 int COMMENT 'first column', c2 string) - |COMMENT 'xin\'s table' - |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) - |CLUSTERED BY (c1, c2) SORTED BY (c1 ASC, C2 DESC) INTO 5 BUCKETS - |row format delimited fields terminated by ',' - |COLLECTION ITEMS TERMINATED BY ',' - |MAP KEYS TERMINATED BY ',' - |NULL DEFINED AS '\N' - |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', - |'my.property.three'='2', 'my.property.four'='false') - """.stripMargin) - sql("show create table t1").show(false) + withTempDir { tmpDir => + withTable("t1") { + sql( + s""" + |create external table t1(c1 int COMMENT 'first column', c2 string) + |COMMENT 'some table' + |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) + |CLUSTERED BY (c1, c2) SORTED BY (c1 ASC, C2 DESC) INTO 5 BUCKETS + |row format delimited fields terminated by ',' + |COLLECTION ITEMS TERMINATED BY ',' + |MAP KEYS TERMINATED BY ',' + |NULL DEFINED AS 'NnN' + |stored as parquet + |location '${tmpDir}' + |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', + |'my.property.three'='2', 'my.property.four'='false') + """.stripMargin) + sql("show create table t1").show(false) + } } } - test("show create table - hive temp table") { - withTable("t1") { - sql( - """ - |create TEMPORARY table t1(c1 int, c2 string) - |row format delimited fields terminated by ',' - |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - """.stripMargin) - sql("show create table t1").show(false) + test( + "show create 
table - hive temp table") { + withTempDir { tmpDir => + withTable("t1") { + sql( + s""" + |create TEMPORARY table t1(c1 int, c2 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location '${tmpDir}' + """.stripMargin) + sql("show create table t1").show(false) + } } } test("show create table - hive TEMPORARY external table") { - withTable("t1") { - sql( - """ - |create TEMPORARY external table t1(c1 int, c2 string) - |row format delimited fields terminated by ',' - |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - """.stripMargin) - sql("show create table t1").show(false) - } - } - - test("show create table - hive view") { - withTable("t1") { - withView("v1") { + withTempDir { tmpDir => + withTable("t1") { sql( - """ - |create table t1(c1 int, c2 string) + s""" + |create TEMPORARY external table t1(c1 int, c2 string) |row format delimited fields terminated by ',' |stored as parquet - |location 'file:///home/xwu0226/spark-test/data/t' - """.stripMargin) - sql( - """ - |create view v1 as select * from t1 + |location '${tmpDir}' """.stripMargin) - sql("show create table v1").show(false) + sql("show create table t1").show(false) + } + } + } + + test("show create table - hive view") { + withTempDir { tmpDir => + withTable("t1") { + withView("v1") { + sql( + s""" + |create table t1(c1 int, c2 string) + |row format delimited fields terminated by ',' + |stored as parquet + |location '${tmpDir}' + """.stripMargin) + sql( + """ + |create view v1 as select * from t1 + """.stripMargin) + sql("show create table v1").show(false) + } } } } @@ -160,7 +175,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile withTable("jsonTable") { sql( - s"""CREATE TABLE jsonTable + s""" + |CREATE TABLE jsonTable |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( | path '$jsonFilePath' @@ -174,7 +190,8 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile withTable("jsonTable") { sql( - s"""CREATE TABLE jsonTable (c1 string, c2 string, c3 int) + s""" + |CREATE TABLE jsonTable (c1 string, c2 string, c3 int) |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( | path '$jsonFilePath' From 8cb7a7299df84f2608b91b092a7df6795b85d41e Mon Sep 17 00:00:00 2001 From: xin Wu Date: Wed, 6 Apr 2016 11:12:07 -0700 Subject: [PATCH 007/109] continue the database ddl generation --- .../spark/sql/hive/HiveSessionCatalog.scala | 84 ++++++++++++++----- .../{execution => }/HiveShowDDLSuite.scala | 17 ++-- .../sql/hive/MetastoreDataSourcesSuite.scala | 23 +++++ 3 files changed, 97 insertions(+), 27 deletions(-) rename sql/hive/src/test/scala/org/apache/spark/sql/hive/{execution => }/HiveShowDDLSuite.scala (93%) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index da8a4527e708..b414a94966c1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -241,33 +241,41 @@ private[sql] class HiveSessionCatalog( val info = new ExpressionInfo(clazz.getCanonicalName, functionName) createTempFunction(functionName, info, builder, ignoreIfExists = false) } + + private def generateCreateTableHeader( + ct: CatalogTable, + processedProps: 
scala.collection.mutable.ArrayBuffer[String]): String = { + if (ct.tableType == CatalogTableType.EXTERNAL_TABLE) { + processedProps += "EXTERNAL" + "CREATE EXTERNAL TABLE " + ct.qualifiedName + } else { + "CREATE TABLE " + ct.qualifiedName + } + } + private def generateCols(ct: CatalogTable): String = { + val cols = ct.schema map { col => + col.name + " " + col.dataType + (col.comment.getOrElse("") match { + case cmt: String if cmt.length > 0 => " COMMENT '" + escapeHiveCommand(cmt) + "'" + case _ => "" + }) + } + cols.mkString("(", ", ", ")") + } + /** * Generate Create table DDL string for the specified tableIdentifier * that is from Hive metastore */ override def generateHiveDDL(ct: CatalogTable): String = { - val sb = new StringBuilder("CREATE ") + val sb = new StringBuilder("") val processedProperties = scala.collection.mutable.ArrayBuffer.empty[String] if (ct.tableType == CatalogTableType.VIRTUAL_VIEW) { - sb.append(" VIEW " + ct.qualifiedName + " AS " + ct.viewOriginalText.getOrElse("")) + sb.append("CREATE VIEW " + ct.qualifiedName + " AS " + ct.viewOriginalText.getOrElse("")) } else { - if (ct.tableType == CatalogTableType.EXTERNAL_TABLE) { - processedProperties += "EXTERNAL" - sb.append(" EXTERNAL TABLE " + ct.qualifiedName) - } else { - sb.append(" TABLE " + ct.qualifiedName) - } - // column list - val cols = ct.schema map { col => - col.name + " " + col.dataType + (col.comment.getOrElse("") match { - case cmt: String if cmt.length > 0 => " COMMENT '" + escapeHiveCommand(cmt) + "'" - case _ => "" - }) - // hive ddl does not honor NOT NULL, it is always default to be nullable - } - sb.append(cols.mkString("(", ", ", ")") + "\n") + sb.append(generateCreateTableHeader(ct, processedProperties) + "\n") + sb.append(generateCols(ct) + "\n") // table comment sb.append(" " + @@ -311,6 +319,9 @@ private[sql] class HiveSessionCatalog( sb.append(" ROW FORMAT ") val serdeProps = ct.storage.serdeProperties + val processedSerdeProps = + Seq("field.delim", "colelction.delim", "mapkey.delim", "line.delim", + "serialization.null.format") val delimiterPrefixes = Seq("FIELDS TERMINATED BY", "COLLECTION ITEMS TERMINATED BY", @@ -339,6 +350,14 @@ private[sql] class HiveSessionCatalog( sb.append(escapeHiveCommand(ct.storage.serde.getOrElse("")) + "' \n") } + val leftOverSerdeProps = serdeProps.filter(e => !processedSerdeProps.contains(e._1)) + if (leftOverSerdeProps.size > 0) { + sb.append("WITH SERDEPROPERTIES \n") + sb.append( + leftOverSerdeProps.map(e => "'" + e._1 + "'='" + e._2 + "'"). + mkString("( ", ", ", " )\n")) + } + sb.append("STORED AS INPUTFORMAT '" + escapeHiveCommand(ct.storage.inputFormat.getOrElse("")) + "' \n") sb.append("OUTPUTFORMAT '" + @@ -360,11 +379,34 @@ private[sql] class HiveSessionCatalog( sb.toString() } + /** + * Generate DDL for datasource tables that are created by following ways: + * 1. CREATE [TEMPORARY] TABLE .... USING .... OPTIONS(.....) + * 2. DF.write.format("parquet").saveAsTable("t1") + * @param ct spark sql version of table metadator loaded + * @return DDL string + */ private def generateDataSourceDDL(ct: CatalogTable): String = { - val sb = new StringBuilder("CREATE TABLE " + ct.qualifiedName) - // TODO will continue on generating Datasource syntax DDL - // will remove generateHiveDDL once it is done. 
- generateHiveDDL(ct) + val processedProperties = scala.collection.mutable.ArrayBuffer.empty[String] + val sb = new StringBuilder(generateCreateTableHeader(ct, processedProperties)) + // It is possible that the column list returned from hive metastore is just a dummy + // one, such as "col array", because the metastore was created as spark sql + // specific metastore (refer to HiveMetaStoreCatalog.createDataSourceTable. + // newSparkSQLSpecificMetastoreTable). In such case, the column schema information + // is located in tblproperties in json format. However, the data types in the json string + // spark sql types, e.g.: Long type for hive's bigint. So if we do parse this json string + // and contruct the ddl with such spark sql column type, it may be not executable when it + // comes to hive native command. We may need to wait till spark sql native command to be ready + // in order to decide how to deal with this case. + sb.append(generateCols(ct) + "\n") + sb.append("USING " + ct.properties.get("spark.sql.sources.provider").get + "\n") + sb.append("OPTIONS ") + val options = scala.collection.mutable.ArrayBuffer.empty[String] + ct.storage.serdeProperties.foreach { e => + options += "" + escapeHiveCommand(e._1) + " '" + escapeHiveCommand(e._2) + "'" + } + if (options.size > 0) sb.append(options.mkString("( ", ", \n", " )")) + sb.toString } /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala similarity index 93% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala index 7e39daa47afc..b73483b3958d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveShowDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala @@ -15,9 +15,9 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hive.execution +package org.apache.spark.sql.hive -import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils @@ -57,7 +57,7 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto withTable("t1") { sql( s""" - |create external table t1(c1 int, c2 string) + |create external table t1(c1 long, c2 string) |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) |row format delimited fields terminated by ',' |stored as parquet @@ -157,7 +157,9 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto test("show create table -- datasource table") { withTable("t_datasource") { - sql("select 1, 'abc'").write.saveAsTable("t_datasource") + val df = sql("select 1, 'abc'") + df.write.saveAsTable("t_datasource") + sql("select * from t_datasource").show sql("show create table t_datasource").show(false) } } @@ -166,7 +168,7 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile withTable("t_datasource") { val df = sqlContext.read.json(jsonFilePath) - df.write.format("json").saveAsTable("t_datasource") + df.write.partitionBy("a").format("json").saveAsTable("t_datasource") sql("show create table t_datasource").show(false) } } @@ -179,7 +181,9 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |CREATE TABLE jsonTable |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( - | path '$jsonFilePath' + | path '$jsonFilePath', + | key.key1 'value1', + | 'key.key2' 'value2' |) """.stripMargin) sql("show create table jsonTable").show(false) @@ -187,6 +191,7 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("show create table -- USING and OPTIONS with column definition") { + val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile withTable("jsonTable") { sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 3c299daa778c..cc407122befc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -937,4 +937,27 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv }) } } + + test("SPARK-14346: show create table created with datasource -- partitioned") { + withTable("ttt3") { + val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c") + df.write + .partitionBy("a") + .format("parquet") + .mode(SaveMode.Overwrite) + .saveAsTable("ttt3") + sql("show create table ttt3").show(false) + } + } + + test("SPARK-14346: show create table created with datasource") { + withTable("ttt3") { + val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c") + df.write + .format("parquet") + .mode(SaveMode.Overwrite) + .saveAsTable("ttt3") + sql("show create table ttt3").show(false) + } + } } From 8b67d22c5ed8fd6b309df772e4a372e741acf630 Mon Sep 17 00:00:00 2001 From: xin Wu Date: Fri, 8 Apr 2016 13:57:12 -0700 Subject: [PATCH 008/109] support datasource ddl --- .../sql/execution/command/commands.scala | 1 - .../spark/sql/hive/HiveSessionCatalog.scala | 107 +++--- 
.../spark/sql/hive/HiveShowDDLSuite.scala | 316 +++++++++++++++--- .../sql/hive/MetastoreDataSourcesSuite.scala | 23 -- 4 files changed, 318 insertions(+), 129 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index f8476033392f..15caa2ebaa5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -443,7 +443,6 @@ case class ShowCreateTableCommand( override def run(sqlContext: SQLContext): Seq[Row] = { val catalog = sqlContext.sessionState.catalog - val db = databaseName.getOrElse(catalog.getCurrentDatabase) Seq(Row(catalog.generateTableDDL(TableIdentifier(tableName, databaseName)))) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index b414a94966c1..aa5ae8ff4252 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.hive +import org.apache.spark.sql.catalyst.util._ + +import scala.collection.immutable.Map.Map4 import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal @@ -38,7 +41,7 @@ import org.apache.spark.sql.execution.datasources.BucketSpec import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{ArrayType, StructField, DataType, StructType} import org.apache.spark.util.Utils @@ -241,10 +244,10 @@ private[sql] class HiveSessionCatalog( val info = new ExpressionInfo(clazz.getCanonicalName, functionName) createTempFunction(functionName, info, builder, ignoreIfExists = false) } - + private def generateCreateTableHeader( ct: CatalogTable, - processedProps: scala.collection.mutable.ArrayBuffer[String]): String = { + processedProps: scala.collection.mutable.ArrayBuffer[String]): String = { if (ct.tableType == CatalogTableType.EXTERNAL_TABLE) { processedProps += "EXTERNAL" "CREATE EXTERNAL TABLE " + ct.qualifiedName @@ -255,19 +258,15 @@ private[sql] class HiveSessionCatalog( private def generateCols(ct: CatalogTable): String = { val cols = ct.schema map { col => - col.name + " " + col.dataType + (col.comment.getOrElse("") match { + "`" + col.name + "` " + col.dataType + (col.comment.getOrElse("") match { case cmt: String if cmt.length > 0 => " COMMENT '" + escapeHiveCommand(cmt) + "'" case _ => "" }) } cols.mkString("(", ", ", ")") } - - /** - * Generate Create table DDL string for the specified tableIdentifier - * that is from Hive metastore - */ - override def generateHiveDDL(ct: CatalogTable): String = { + + private def generateHiveDDL(ct: CatalogTable): String = { val sb = new StringBuilder("") val processedProperties = scala.collection.mutable.ArrayBuffer.empty[String] @@ -304,7 +303,7 @@ private[sql] class HiveSessionCatalog( sb.append(" CLUSTERED BY ") sb.append(ct.bucketColumns.mkString("( ", ", ", " )")) - // TODO sort columns don't have the the right scala types yet. need to adapt to Hive Order + // TODO sort columns don't have the the right types yet. 
need to adapt to Hive Order if (ct.sortColumns.size > 0) { sb.append(" SORTED BY ") sb.append(ct.sortColumns.map(_.name).mkString("( ", ", ", " )")) @@ -319,43 +318,19 @@ private[sql] class HiveSessionCatalog( sb.append(" ROW FORMAT ") val serdeProps = ct.storage.serdeProperties - val processedSerdeProps = - Seq("field.delim", "colelction.delim", "mapkey.delim", "line.delim", - "serialization.null.format") - val delimiterPrefixes = - Seq("FIELDS TERMINATED BY", - "COLLECTION ITEMS TERMINATED BY", - "MAP KEYS TERMINATED BY", - "LINES TERMINATED BY", - "NULL DEFINED AS") - - val delimiters = Seq( - serdeProps.get("field.delim"), - serdeProps.get("colelction.delim"), - serdeProps.get("mapkey.delim"), - serdeProps.get("line.delim"), - serdeProps.get("serialization.null.format")).zipWithIndex - - val delimiterStrs = delimiters collect { - case (Some(ch), i) => - delimiterPrefixes(i) + " '" + - escapeHiveCommand(ch) + - "' " - } - if (delimiterStrs.size > 0) { - sb.append("DELIMITED ") - sb.append(delimiterStrs.mkString(" ") + "\n") - } else { - sb.append("SERDE '") - sb.append(escapeHiveCommand(ct.storage.serde.getOrElse("")) + "' \n") - } + // potentially for serde properties that should be ignored + val processedSerdeProps = Seq() + + sb.append("SERDE '") + sb.append(escapeHiveCommand(ct.storage.serde.getOrElse("")) + "' \n") val leftOverSerdeProps = serdeProps.filter(e => !processedSerdeProps.contains(e._1)) if (leftOverSerdeProps.size > 0) { sb.append("WITH SERDEPROPERTIES \n") sb.append( - leftOverSerdeProps.map(e => "'" + e._1 + "'='" + e._2 + "'"). - mkString("( ", ", ", " )\n")) + leftOverSerdeProps.map { e => + "'" + escapeHiveCommand(e._1) + "'='" + escapeHiveCommand(e._2) + "'" + }.mkString("( ", ", ", " )\n")) } sb.append("STORED AS INPUTFORMAT '" + @@ -393,12 +368,8 @@ private[sql] class HiveSessionCatalog( // one, such as "col array", because the metastore was created as spark sql // specific metastore (refer to HiveMetaStoreCatalog.createDataSourceTable. // newSparkSQLSpecificMetastoreTable). In such case, the column schema information - // is located in tblproperties in json format. However, the data types in the json string - // spark sql types, e.g.: Long type for hive's bigint. So if we do parse this json string - // and contruct the ddl with such spark sql column type, it may be not executable when it - // comes to hive native command. We may need to wait till spark sql native command to be ready - // in order to decide how to deal with this case. - sb.append(generateCols(ct) + "\n") + // is located in tblproperties in json format. 
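+    // For example, a table saved with df.write.saveAsTable(...) typically carries
+    //   spark.sql.sources.schema.numParts = "1"
+    //   spark.sql.sources.schema.part.0   = {"type":"struct","fields":[...]}
+    // in its table properties; generateColsDataSource below re-assembles these parts
+    // and renders the column list from the recovered StructType.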
+    sb.append(generateColsDataSource(ct, processedProperties) + "\n")
     sb.append("USING " + ct.properties.get("spark.sql.sources.provider").get + "\n")
     sb.append("OPTIONS ")
     val options = scala.collection.mutable.ArrayBuffer.empty[String]
@@ -409,18 +380,46 @@
     sb.toString
   }
 
+  private def generateColsDataSource(
+      ct: CatalogTable,
+      processedProps: scala.collection.mutable.ArrayBuffer[String]): String = {
+    val schemaStringFromParts: Option[String] = {
+      ct.properties.get("spark.sql.sources.schema.numParts").map { numParts =>
+        val parts = (0 until numParts.toInt).map { index =>
+          val part = ct.properties.get(s"spark.sql.sources.schema.part.$index").orNull
+          if (part == null) {
+            throw new AnalysisException(
+              "Could not read schema from the metastore because it is corrupted " +
+                s"(missing part $index of the schema, $numParts parts are expected).")
+          }
+          part
+        }
+        // Stick all parts back to a single schema string.
+        parts.mkString
+      }
+    }
+
+    if (schemaStringFromParts.isDefined) {
+      (schemaStringFromParts.map(s => DataType.fromJson(s).asInstanceOf[StructType]).
+        get map { f => s"${quoteIdentifier(f.name)} ${f.dataType.sql}" })
+        .mkString("( ", ", ", " )")
+    } else {
+      ""
+    }
+  }
+
   /**
    * Generate Create table DDL string for the specified tableIdentifier
    * that is from Hive metastore
    */
   override def generateTableDDL(name: TableIdentifier): String = {
-    val ct = this.getTable(name)
-    if(ct.properties.get("spark.sql.sources.provider") == None) {
-      // CREATE [TEMPORARY] TABLE ... ROW FORMAT.. TBLPROPERTIES (...)
-      generateHiveDDL(ct)
-    } else {
+    val ct = this.getTableMetadata(name)
+    if (ct.properties.get("spark.sql.sources.provider").isDefined) {
       // CREATE [TEMPORARY] TABLE .... USING .... OPTIONS (...)
       generateDataSourceDDL(ct)
+    } else {
+      // CREATE [TEMPORARY] TABLE ... ROW FORMAT.. TBLPROPERTIES (...)
+      generateHiveDDL(ct)
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala
index b73483b3958d..bcab62bcd10b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowDDLSuite.scala
@@ -18,12 +18,30 @@ package org.apache.spark.sql.hive
 
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, CatalogTable}
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
 class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
-  test("show create table - hive table - no row format") {
+  import hiveContext.implicits._
+
+  var jsonFilePath: String = _
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile
+  }
+  // There are three kinds of CREATE TABLE to test:
+  // 1. Hive syntax: create table t1 (c1 int) partitioned by (c2 int) row format ... tblproperties ...
+  // 2. Spark SQL syntax: create table t1 (c1 int) using ... options (...)
+  // 3.
saving table from datasource: df.write.format("parquet").saveAsTable("t1") + + /** + * Hive syntax DDL + */ + test("Hive syntax DDL: no row format") { withTempDir { tmpDir => withTable("t1") { sql( @@ -32,32 +50,34 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |stored as parquet |location '${tmpDir}' """.stripMargin) - sql("show create table t1").show(false) + assert(compareCatalog( + TableIdentifier("t1"), sql("show create table t1").collect()(0).toSeq(0).toString)) } } } - test("show create table - hive non-external table") { + test("Hive syntax DDL - external table with column and table comments") { withTempDir { tmpDir => withTable("t1") { sql( s""" - |create table t1(c1 int COMMENT 'abc', c2 string) COMMENT 'my table' + |create external table t1(c1 int COMMENT 'abc', c2 string) COMMENT 'my table' |row format delimited fields terminated by ',' |stored as parquet |location '${tmpDir}' """.stripMargin) - sql("show create table t1").show(false) + assert(compareCatalog( + TableIdentifier("t1"),sql("show create table t1").collect()(0).toSeq(0).toString)) } } } - test("show create table - hive external table") { + test("Hive syntax DDL - partitioned, row format and tblproperties") { withTempDir { tmpDir => withTable("t1") { sql( s""" - |create external table t1(c1 long, c2 string) + |create external table t1(c1 bigint, c2 string) |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) |row format delimited fields terminated by ',' |stored as parquet @@ -65,12 +85,13 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', |'my.property.three'='2', 'my.property.four'='false') """.stripMargin) - sql("show create table t1").show(false) + assert(compareCatalog( + TableIdentifier("t1"),sql("show create table t1").collect()(0).toSeq(0).toString)) } } } - test("show create table - hive table - cluster bucket and skew") { + test("Hive syntax DDL - cluster buckets and more row format definition") { withTempDir { tmpDir => withTable("t1") { sql( @@ -78,23 +99,23 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |create external table t1(c1 int COMMENT 'first column', c2 string) |COMMENT 'some table' |PARTITIONED BY (c3 int COMMENT 'partition column', c4 string) - |CLUSTERED BY (c1, c2) SORTED BY (c1 ASC, C2 DESC) INTO 5 BUCKETS + |CLUSTERED BY (c1, c2) INTO 5 BUCKETS |row format delimited fields terminated by ',' |COLLECTION ITEMS TERMINATED BY ',' |MAP KEYS TERMINATED BY ',' - |NULL DEFINED AS 'NnN' + |NULL DEFINED AS 'NaN' |stored as parquet |location '${tmpDir}' |TBLPROPERTIES ('my.property.one'='true', 'my.property.two'='1', |'my.property.three'='2', 'my.property.four'='false') """.stripMargin) - sql("show create table t1").show(false) + assert(compareCatalog( + TableIdentifier("t1"),sql("show create table t1").collect()(0).toSeq(0).toString)) } } } - test( - "show create table - hive temp table") { + test("Hive syntax DDL - temp table") { withTempDir { tmpDir => withTable("t1") { sql( @@ -104,12 +125,13 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |stored as parquet |location '${tmpDir}' """.stripMargin) - sql("show create table t1").show(false) + assert(compareCatalog( + TableIdentifier("t1"),sql("show create table t1").collect()(0).toSeq(0).toString)) } } } - test("show create table - hive TEMPORARY external table") { + test("Hive syntax DDL - TEMPORARY external table") { withTempDir { 
tmpDir => withTable("t1") { sql( @@ -119,12 +141,13 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |stored as parquet |location '${tmpDir}' """.stripMargin) - sql("show create table t1").show(false) + assert(compareCatalog( + TableIdentifier("t1"),sql("show create table t1").collect()(0).toSeq(0).toString)) } } } - test("show create table - hive view") { + test("Hive syntax DDL - hive view") { withTempDir { tmpDir => withTable("t1") { withView("v1") { @@ -139,42 +162,70 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto """ |create view v1 as select * from t1 """.stripMargin) - sql("show create table v1").show(false) + assert(compareCatalog( + TableIdentifier("v1"),sql("show create table v1").collect()(0).toSeq(0).toString)) } } } } - test("show create temp table") { - withTable("t1") { - sql( - """ - |create temporary table t1(c1 int, c2 string) - """.stripMargin) - sql("show create table t1").show(false) + test("Hive syntax DDL - SERDE ") { + withTempDir { tmpDir => + withTable("t1") { + sql( + s""" + |create table t1(c1 int, c2 string) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |WITH SERDEPROPERTIES ('mapkey.delim'=',', 'field.delim'=',') + |STORED AS + |INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + |OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |location '${tmpDir}' + """.stripMargin) + assert(compareCatalog( + TableIdentifier("t1"),sql("show create table t1").collect()(0).toSeq(0).toString)) + } } } - test("show create table -- datasource table") { - withTable("t_datasource") { - val df = sql("select 1, 'abc'") - df.write.saveAsTable("t_datasource") - sql("select * from t_datasource").show - sql("show create table t_datasource").show(false) + /** + * Datasource table syntax DDL + */ + test("Datasource Table DDL syntax - persistent JSON table with a user specified schema") { + withTable("jsonTable") { + sql( + s""" + |CREATE TABLE jsonTable ( + |a string, + |b String, + |`c_!@(3)` int, + |`` Struct<`d!`:array, `=`:array>>) + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS ( + | path '$jsonFilePath') + """.stripMargin) + assert(compareCatalog( + TableIdentifier("jsonTable"), + sql("show create table jsonTable").collect()(0).toSeq(0).toString)) } } - test("show create table -- partitioned") { - val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile - withTable("t_datasource") { - val df = sqlContext.read.json(jsonFilePath) - df.write.partitionBy("a").format("json").saveAsTable("t_datasource") - sql("show create table t_datasource").show(false) + test("Datasource Table DDL syntax - persistent JSON with a subset of user-specified fields") { + withTable("jsonTable") { + // This works because JSON objects are self-describing and JSONRelation can get needed + // field values based on field names. 
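+      // The backquoted column names here (and in the previous test) also exercise
+      // identifier quoting in the column list of the generated DDL.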
+ sql( + s"""CREATE TABLE jsonTable (`` Struct<`=`:array>>, b String) + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS (path '$jsonFilePath') + """.stripMargin) + assert(compareCatalog( + TableIdentifier("jsonTable"), + sql("show create table jsonTable").collect()(0).toSeq(0).toString)) } } - test("show create table -- USING and OPTIONS") { - val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile + test("Datasource Table DDL syntax - USING and OPTIONS - no user-specified schema") { withTable("jsonTable") { sql( s""" @@ -186,23 +237,186 @@ class HiveShowDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleto | 'key.key2' 'value2' |) """.stripMargin) - sql("show create table jsonTable").show(false) + assert(compareCatalog( + TableIdentifier("jsonTable"), + sql("show create table jsonTable").collect()(0).toSeq(0).toString)) } } - test("show create table -- USING and OPTIONS with column definition") { - - val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile - withTable("jsonTable") { + test("Datasource Table DDL syntax - USING and NO options") { + withTable("parquetTable") { sql( s""" - |CREATE TABLE jsonTable (c1 string, c2 string, c3 int) - |USING org.apache.spark.sql.json.DefaultSource - |OPTIONS ( - | path '$jsonFilePath' - |) + |CREATE TABLE parquetTable (c1 int, c2 string, c3 long) + |USING parquet """.stripMargin) - sql("show create table jsonTable").show(false) + assert(compareCatalog( + TableIdentifier("parquetTable"), + sql("show create table parquetTable").collect()(0).toSeq(0).toString)) + } + } + + /** + * Datasource saved to table + */ + test("Save datasource to table - dataframe with a select ") { + withTable("t_datasource") { + val df = sql("select 1, 'abc'") + df.write.saveAsTable("t_datasource") + assert(compareCatalog( + TableIdentifier("t_datasource"), + sql("show create table t_datasource").collect()(0).toSeq(0).toString)) + } + } + + test("Save datasource to table - dataframe from json file") { + val jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile + withTable("t_datasource") { + val df = sqlContext.read.json(jsonFilePath) + df.write.format("json").saveAsTable("t_datasource") + assert(compareCatalog( + TableIdentifier("t_datasource"), + sql("show create table t_datasource").collect()(0).toSeq(0).toString)) + } + } + + test("Save datasource to table -- dataframe with user-specified schema and partitioned") { + // TODO partitioning information will be lost in the DDL, because datasource DDL syntax + // does not have a place to keep the partitioning columns. 
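+    // If the partitioning columns are dropped, the CatalogTable rebuilt from the
+    // generated DDL may not match the original table's partition column metadata exactly.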
+    withTable("ttt3") {
+      val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c")
+      df.write
+        .partitionBy("a")
+        .format("parquet")
+        .mode(SaveMode.Overwrite)
+        .saveAsTable("ttt3")
+      assert(compareCatalog(
+        TableIdentifier("ttt3"),
+        sql("show create table ttt3").collect()(0).toSeq(0).toString))
+    }
+  }
+
+  /**
+   * To verify that the DDL generated for a table is correct, we compare the CatalogTable
+   * of the existing table with the CatalogTable of a table created from that generated DDL.
+   * @param expectedTable identifier of the existing table whose metadata is the expected result
+   * @param actualDDL the DDL string returned by SHOW CREATE TABLE for that table
+   * @return true if the two CatalogTable descriptions match, false otherwise
+   */
+  private def compareCatalog(expectedTable: TableIdentifier, actualDDL: String): Boolean = {
+    val actualTable = expectedTable.table + "_actual"
+    var actual: CatalogTable = null
+    val expected: CatalogTable =
+      sqlContext.sessionState.catalog.getTableMetadata(expectedTable)
+    withTempDir { tmpDir =>
+      withTable(actualTable) {
+        var revisedActualDDL: String = null
+        if (expected.tableType == CatalogTableType.EXTERNAL_TABLE) {
+          revisedActualDDL = actualDDL.replace(expectedTable.table.toLowerCase(), actualTable)
+        } else {
+          revisedActualDDL = actualDDL
+            .replace(expectedTable.table.toLowerCase(), actualTable)
+            .replaceAll("path.*,", s"path '${tmpDir}',")
+        }
+        println(revisedActualDDL)
+        sql(revisedActualDDL)
+        actual = sqlContext.sessionState.catalog.getTableMetadata(TableIdentifier(actualTable))
+      }
+    }
+
+    if (expected.properties.get("spark.sql.sources.provider").isDefined) {
+      // datasource table: The generated DDL will be like:
+      // CREATE EXTERNAL TABLE () USING
+      // OPTIONS (