
Commit a043ca2

address comments
1 parent 55c2c5e commit a043ca2

6 files changed (+20, -115 lines)

sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala

Lines changed: 0 additions & 3 deletions
@@ -223,9 +223,6 @@ abstract class Catalog {
    * If this table is cached as an InMemoryRelation, drop the original cached version and make the
    * new version cached lazily.
    *
-   * If the table's schema is inferred at runtime, infer the schema again and update the schema
-   * in the external catalog.
-   *
    * @since 2.0.0
    */
   def refreshTable(tableName: String): Unit
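With this change, refreshTable only invalidates cached metadata and data; it no longer re-infers the schema or rewrites it in the external catalog. A minimal usage sketch of the retained behavior (the table name "logs" is hypothetical):

  // Files under the table's location were rewritten outside of Spark.
  // refreshTable drops any cached metadata/data for the table; the schema
  // recorded in the catalog at table-creation time is left untouched.
  spark.catalog.refreshTable("logs")
  val df = spark.table("logs")   // re-resolved against the unchanged catalog schema
  df.show()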

sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala

Lines changed: 13 additions & 29 deletions
@@ -127,7 +127,6 @@ case class CreateDataSourceTableCommand(
       sparkSession = sparkSession,
       tableIdent = tableIdent,
       schema = dataSource.schema,
-      isSchemaInferred = userSpecifiedSchema.isEmpty,
       partitionColumns = partitionColumns,
       bucketSpec = bucketSpec,
       provider = provider,
@@ -279,7 +278,6 @@ case class CreateDataSourceTableAsSelectCommand(
       sparkSession = sparkSession,
       tableIdent = tableIdent,
       schema = result.schema,
-      isSchemaInferred = false,
       partitionColumns = partitionColumns,
       bucketSpec = bucketSpec,
       provider = provider,
@@ -293,6 +291,7 @@ case class CreateDataSourceTableAsSelectCommand(
   }
 }
 
+
 object CreateDataSourceTableUtils extends Logging {
 
   val DATASOURCE_PREFIX = "spark.sql.sources."
@@ -301,7 +300,6 @@ object CreateDataSourceTableUtils extends Logging {
   val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path"
   val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema"
   val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "."
-  val DATASOURCE_SCHEMA_ISINFERRED = DATASOURCE_SCHEMA_PREFIX + "isInferred"
   val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts"
   val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols"
   val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols"
@@ -326,18 +324,21 @@ object CreateDataSourceTableUtils extends Logging {
     matcher.matches()
   }
 
-  /**
-   * Saves the schema (including partition info) into the table properties.
-   * Overwrites the schema, if already existed.
-   */
-  def saveSchema(
+  def createDataSourceTable(
       sparkSession: SparkSession,
+      tableIdent: TableIdentifier,
       schema: StructType,
       partitionColumns: Array[String],
-      tableProperties: mutable.HashMap[String, String]): Unit = {
-    // Serialized JSON schema string may be too long to be stored into a single
-    // metastore SerDe property. In this case, we split the JSON string and store each part as
-    // a separate table property.
+      bucketSpec: Option[BucketSpec],
+      provider: String,
+      options: Map[String, String],
+      isExternal: Boolean): Unit = {
+    val tableProperties = new mutable.HashMap[String, String]
+    tableProperties.put(DATASOURCE_PROVIDER, provider)
+
+    // Saves optional user specified schema. Serialized JSON schema string may be too long to be
+    // stored into a single metastore SerDe property. In this case, we split the JSON string and
+    // store each part as a separate SerDe property.
     val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
     val schemaJsonString = schema.json
     // Split the JSON string.
@@ -353,23 +354,6 @@ object CreateDataSourceTableUtils extends Logging {
         tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
       }
     }
-  }
-
-  def createDataSourceTable(
-      sparkSession: SparkSession,
-      tableIdent: TableIdentifier,
-      schema: StructType,
-      isSchemaInferred: Boolean,
-      partitionColumns: Array[String],
-      bucketSpec: Option[BucketSpec],
-      provider: String,
-      options: Map[String, String],
-      isExternal: Boolean): Unit = {
-    val tableProperties = new mutable.HashMap[String, String]
-    tableProperties.put(DATASOURCE_PROVIDER, provider)
-
-    tableProperties.put(DATASOURCE_SCHEMA_ISINFERRED, isSchemaInferred.toString.toUpperCase)
-    saveSchema(sparkSession, schema, partitionColumns, tableProperties)
 
     if (bucketSpec.isDefined) {
       val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get
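The schema handling that used to live in saveSchema is now inlined in createDataSourceTable, and the storage scheme itself is unchanged: the schema's JSON form is chopped into threshold-sized chunks stored under numbered table properties. A hedged, self-contained sketch of that scheme; the "numParts" suffix follows the constant above, while the "part." suffix is assumed from DATASOURCE_SCHEMA_PART_PREFIX, and the helper names here are illustrative rather than the exact Spark code:

  import scala.collection.mutable
  import org.apache.spark.sql.types.{DataType, StructType}

  // Store a possibly very long schema JSON string as several bounded-size properties.
  def putSchemaJson(schema: StructType, threshold: Int,
      props: mutable.HashMap[String, String]): Unit = {
    val parts = schema.json.grouped(threshold).toSeq
    props.put("spark.sql.sources.schema.numParts", parts.size.toString)
    parts.zipWithIndex.foreach { case (part, index) =>
      props.put(s"spark.sql.sources.schema.part.$index", part)
    }
  }

  // Reading it back concatenates part.0 .. part.(numParts - 1) and re-parses the JSON.
  def getSchemaJson(props: Map[String, String]): Option[StructType] = {
    props.get("spark.sql.sources.schema.numParts").map { n =>
      val json = (0 until n.toInt).map(i => props(s"spark.sql.sources.schema.part.$i")).mkString
      DataType.fromJson(json).asInstanceOf[StructType]
    }
  }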

sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala

Lines changed: 0 additions & 4 deletions
@@ -487,10 +487,6 @@ object DDLUtils {
     isDatasourceTable(table.properties)
   }
 
-  def isSchemaInferred(table: CatalogTable): Boolean = {
-    table.properties.get(DATASOURCE_SCHEMA_ISINFERRED) == Option(true.toString.toUpperCase)
-  }
-
   /**
    * If the command ALTER VIEW is to alter a table or ALTER TABLE is to alter a view,
    * issue an exception [[AnalysisException]].

sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala

Lines changed: 2 additions & 57 deletions
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.internal
 
 import scala.collection.JavaConverters._
-import scala.collection.mutable
 import scala.reflect.runtime.universe.TypeTag
 
 import org.apache.spark.annotation.Experimental
@@ -28,8 +27,7 @@ import org.apache.spark.sql.catalyst.{DefinedByConstructorParams, TableIdentifie
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.execution.command.{CreateDataSourceTableUtils, DDLUtils}
-import org.apache.spark.sql.execution.datasources.{CreateTableUsing, DataSource, HadoopFsRelation}
+import org.apache.spark.sql.execution.datasources.CreateTableUsing
 import org.apache.spark.sql.types.StructType
 
 
@@ -352,68 +350,15 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
     sparkSession.sharedState.cacheManager.lookupCachedData(qName).nonEmpty
   }
 
-  /**
-   * Refresh the inferred schema stored in the external catalog for data source tables.
-   */
-  private def refreshInferredSchema(tableIdent: TableIdentifier): Unit = {
-    val table = sessionCatalog.getTableMetadataOption(tableIdent)
-    table.foreach { tableDesc =>
-      if (DDLUtils.isDatasourceTable(tableDesc) && DDLUtils.isSchemaInferred(tableDesc)) {
-        val partitionColumns = DDLUtils.getPartitionColumnsFromTableProperties(tableDesc)
-        val bucketSpec = DDLUtils.getBucketSpecFromTableProperties(tableDesc)
-        val dataSource =
-          DataSource(
-            sparkSession,
-            userSpecifiedSchema = None,
-            partitionColumns = partitionColumns,
-            bucketSpec = bucketSpec,
-            className = tableDesc.properties(CreateDataSourceTableUtils.DATASOURCE_PROVIDER),
-            options = tableDesc.storage.serdeProperties)
-          .resolveRelation().asInstanceOf[HadoopFsRelation]
-
-        val schemaProperties = new mutable.HashMap[String, String]
-        CreateDataSourceTableUtils.saveSchema(
-          sparkSession, dataSource.schema, dataSource.partitionSchema.fieldNames, schemaProperties)
-
-        def isPropertyForInferredSchema(key: String): Boolean = {
-          key match {
-            case CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTS => true
-            case CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTCOLS => true
-            case _
-              if key.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA_PART_PREFIX) ||
-                key.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA_PARTCOL_PREFIX)
-              => true
-            case _ => false
-          }
-        }
-
-        // Keep the properties that are not for schema or partition columns
-        val tablePropertiesWithoutSchema = tableDesc.properties.filterKeys { k =>
-          !isPropertyForInferredSchema(k)
-        }
-
-        val newTable = tableDesc.copy(properties = tablePropertiesWithoutSchema ++ schemaProperties)
-
-        // Alter the schema-related table properties that are stored in external catalog.
-        sessionCatalog.alterTable(newTable)
-      }
-    }
-  }
-
   /**
    * Refresh the cache entry for a table, if any. For Hive metastore table, the metadata
-   * is refreshed.
+   * is refreshed. For data source tables, the schema will not be inferred and refreshed.
    *
    * @group cachemgmt
    * @since 2.0.0
    */
   override def refreshTable(tableName: String): Unit = {
     val tableIdent = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
-    // Refresh the schema in external catalog, if it is a data source table whose schema is inferred
-    // at runtime. For user-specified schema, we do not infer and update the schema.
-    // TODO: Support column-related ALTER TABLE DDL commands, and then users can update
-    // the user-specified schema.
-    refreshInferredSchema(tableIdent)
     // Temp tables: refresh (or invalidate) any metadata/data cached in the plan recursively.
     // Non-temp tables: refresh the metadata cache.
     sessionCatalog.refreshTable(tableIdent)
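With refreshInferredSchema removed, REFRESH TABLE no longer picks up a changed file layout for data source tables whose schema was inferred at creation time. A hedged sketch of how a user could force re-inference under the new behavior, by recreating the table (the table name and path are hypothetical):

  // Re-create the data source table so its schema is inferred again from the current files.
  spark.sql("DROP TABLE IF EXISTS json_tab")
  spark.sql("CREATE TABLE json_tab USING json OPTIONS (path '/data/json_tab')")
  spark.catalog.refreshTable("json_tab")   // still useful for invalidating cached metadata/data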

sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala

Lines changed: 3 additions & 17 deletions
@@ -336,9 +336,6 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       val newDF = sparkContext.parallelize(1 to 10).map(i => (i, i.toString))
         .toDF("newCol1", "newCol2")
       newDF.write.format("json").partitionBy("newCol1").mode(SaveMode.Overwrite).save(path)
-      val newSchema = StructType(
-        StructField("newCol2", StringType, nullable = true) ::
-        StructField("newCol1", IntegerType, nullable = true) :: Nil)
 
       // No change on the schema
       val tableMetadataBeforeRefresh = catalog.getTableMetadata(TableIdentifier(tabName))
@@ -349,27 +346,16 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
         DDLUtils.getPartitionColumnsFromTableProperties(tableMetadataBeforeRefresh)
       assert(partColsBeforeRefresh == partitionCols)
 
-      // Refresh
+      // Refresh does not affect the schema
       spark.catalog.refreshTable(tabName)
 
       val tableMetadataAfterRefresh = catalog.getTableMetadata(TableIdentifier(tabName))
       val tableSchemaAfterRefresh =
         DDLUtils.getSchemaFromTableProperties(tableMetadataAfterRefresh)
-      assert(tableSchemaAfterRefresh == Option(newSchema))
+      assert(tableSchemaAfterRefresh == Option(schema))
       val partColsAfterRefresh =
         DDLUtils.getPartitionColumnsFromTableProperties(tableMetadataAfterRefresh)
-      assert(partColsAfterRefresh == Seq("newCol1"))
-
-      // Refresh after no change
-      spark.catalog.refreshTable(tabName)
-
-      val tableMetadataNoChangeAfterRefresh = catalog.getTableMetadata(TableIdentifier(tabName))
-      val tableSchemaNoChangeAfterRefresh =
-        DDLUtils.getSchemaFromTableProperties(tableMetadataNoChangeAfterRefresh)
-      assert(tableSchemaNoChangeAfterRefresh == Option(newSchema))
-      val partColsNoChangeAfterRefresh =
-        DDLUtils.getPartitionColumnsFromTableProperties(tableMetadataNoChangeAfterRefresh)
-      assert(partColsNoChangeAfterRefresh == Seq("newCol1"))
+      assert(partColsAfterRefresh == partitionCols)
     }
   }
 }

sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala

Lines changed: 2 additions & 5 deletions
@@ -191,10 +191,10 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
 
         sql("REFRESH TABLE jsonTable")
 
-        // Check that the refresh worked
+        // After refresh, schema is not changed.
         checkAnswer(
           sql("SELECT * FROM jsonTable"),
-          Row("a1", "b1", "c1"))
+          Row("a1", "b1"))
       }
     }
   }
@@ -704,7 +704,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         sparkSession = spark,
         tableIdent = TableIdentifier("wide_schema"),
         schema = schema,
-        isSchemaInferred = false,
         partitionColumns = Array.empty[String],
         bucketSpec = None,
         provider = "json",
@@ -990,7 +989,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
        sparkSession = spark,
        tableIdent = TableIdentifier("not_skip_hive_metadata"),
        schema = schema,
-       isSchemaInferred = false,
        partitionColumns = Array.empty[String],
        bucketSpec = None,
        provider = "parquet",
@@ -1006,7 +1004,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
        sparkSession = spark,
        tableIdent = TableIdentifier("skip_hive_metadata"),
        schema = schema,
-       isSchemaInferred = false,
        partitionColumns = Array.empty[String],
        bucketSpec = None,
        provider = "parquet",
