[SPARK-27136][SQL] Remove data source option check_files_exist
## What changes were proposed in this pull request?

The data source option `check_files_exist` was introduced in #23383, when the file source V2 framework was implemented. In that PR, `FileIndex` was made a member of `FileTable` so that partition pruning like 0f9fcab could be implemented later. As a result, a `FileIndex` was always created even for file writes, so the option was needed to decide whether to check file existence.

After #23774, the option is no longer needed, since DataFrame writes no longer create an unnecessary `FileIndex`. This PR removes the option.
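
To make the resulting behavior concrete, here is a minimal hedged sketch (the paths, app name, and local-mode session are made up for illustration): reads always build a `FileIndex` and therefore always verify that input paths exist, while writes skip the check entirely because no `FileIndex` is created.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("check-files-exist-demo")
  .getOrCreate()

// Reads always build a FileIndex, so a missing input path fails eagerly
// at analysis time; this used to be gated on check_files_exist=true.
try {
  spark.read.format("orc").load("/tmp/no-such-dir")
} catch {
  case e: AnalysisException => println(s"Read failed as expected: ${e.getMessage}")
}

// Writes no longer create a FileIndex at all (since #23774), so there is
// nothing left for check_files_exist=false to suppress.
spark.range(10).write.mode("overwrite").orc("/tmp/check-files-exist-demo")
```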

## How was this patch tested?

Unit test.

Closes #24069 from gengliangwang/removeOptionCheckFilesExist.

Authored-by: Gengliang Wang <gengliang.wang@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
gengliangwang authored and cloud-fan committed Mar 15, 2019
1 parent 8819eab commit 6d22ee3
Showing 3 changed files with 5 additions and 9 deletions.
sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -213,9 +213,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
         val objectMapper = new ObjectMapper()
         Some("paths" -> objectMapper.writeValueAsString(paths.toArray))
       }
-      // TODO SPARK-27113: remove this option.
-      val checkFilesExistsOpt = "check_files_exist" -> "true"
-      val finalOptions = sessionOptions ++ extraOptions.toMap ++ pathsOption + checkFilesExistsOpt
+
+      val finalOptions = sessionOptions ++ extraOptions.toMap ++ pathsOption
       val dsOptions = new CaseInsensitiveStringMap(finalOptions.asJava)
       val table = userSpecifiedSchema match {
         case Some(schema) => provider.getTable(dsOptions, schema)
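
As a side note on the merge in the hunk above, `++` on Scala maps is right-biased for duplicate keys: user-supplied `extraOptions` override session configs, and the synthesized `paths` entry overrides both. A self-contained sketch with made-up keys (plain Scala collections, no Spark APIs):

```scala
// Right-biased merge: later operands win on duplicate keys.
val sessionOptions = Map("compression" -> "snappy", "mergeSchema" -> "false")
val extraOptions = Map("mergeSchema" -> "true") // user option beats session config
val pathsOption = Some("paths" -> """["/data/a","/data/b"]""")

val finalOptions = sessionOptions ++ extraOptions ++ pathsOption
println(finalOptions)
// Map(compression -> snappy, mergeSchema -> true, paths -> ["/data/a","/data/b"])
```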
sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -261,10 +261,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       val provider = cls.getConstructor().newInstance().asInstanceOf[TableProvider]
       val sessionOptions = DataSourceV2Utils.extractSessionConfigs(
         provider, session.sessionState.conf)
-      // TODO SPARK-27113: remove this option.
-      val checkFilesExistsOption = "check_files_exist" -> "false"
-      val options = sessionOptions ++ extraOptions + checkFilesExistsOption
+      val options = sessionOptions ++ extraOptions
       val dsOptions = new CaseInsensitiveStringMap(options.asJava)
+
@rdblue (Contributor) commented on Mar 15, 2019:

@gengliangwang, @cloud-fan,

Please do not commit unnecessary or non-functional changes like this. This extra blank line caused a merge conflict with #24012 that required me to go fix it by hand.

These careless changes make it harder for all contributors to work on Spark because we have to go fix problems that shouldn't require attention. Please be more careful.

@gengliangwang (Author, Member) commented on Mar 16, 2019:

@rdblue got it, sorry about that.

       provider.getTable(dsOptions) match {
         case table: SupportsBatchWrite =>
           lazy val relation = DataSourceV2Relation.create(table, dsOptions)
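
For context, `CaseInsensitiveStringMap` is the options wrapper handed to `TableProvider.getTable` above. A brief sketch of its lookup behavior, assuming the `org.apache.spark.sql.util` package location and a made-up option key:

```scala
import scala.collection.JavaConverters._
import org.apache.spark.sql.util.CaseInsensitiveStringMap

val dsOptions = new CaseInsensitiveStringMap(
  Map("checkFilesExist" -> "true").asJava) // hypothetical key

// Lookups ignore key case, so providers see one canonical spelling
// no matter how the caller wrote the option name.
assert(dsOptions.get("CHECKFILESEXIST") == "true")
assert(dsOptions.get("checkfilesexist") == "true")
```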
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala
@@ -36,10 +36,8 @@ abstract class FileTable(
   lazy val fileIndex: PartitioningAwareFileIndex = {
     val scalaMap = options.asScala.toMap
     val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(scalaMap)
-    // This is an internal config so must be present.
-    val checkFilesExist = options.get("check_files_exist").toBoolean
     val rootPathsSpecified = DataSource.checkAndGlobPathIfNecessary(paths, hadoopConf,
-      checkEmptyGlobPath = true, checkFilesExist = checkFilesExist)
+      checkEmptyGlobPath = true, checkFilesExist = true)
     val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
     new InMemoryFileIndex(
       sparkSession, rootPathsSpecified, scalaMap, userSpecifiedSchema, fileStatusCache)
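
To make the now-unconditional `checkFilesExist = true` concrete, here is a simplified, hypothetical sketch of what such an existence check does with Hadoop's `FileSystem` API; it is not Spark's actual `DataSource.checkAndGlobPathIfNecessary` implementation:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Qualify each input path, expand globs, and fail fast on anything missing.
def resolveAndCheckPaths(paths: Seq[String], hadoopConf: Configuration): Seq[Path] = {
  paths.flatMap { p =>
    val path = new Path(p)
    val fs = path.getFileSystem(hadoopConf)
    val qualified = path.makeQualified(fs.getUri, fs.getWorkingDirectory)
    // globStatus returns null for a literal path that does not exist and
    // an empty array for a glob pattern that matches nothing.
    val matched = Option(fs.globStatus(qualified)).map(_.toSeq).getOrElse(Seq.empty)
    if (matched.isEmpty) {
      throw new IllegalArgumentException(s"Path does not exist: $qualified")
    }
    matched.map(_.getPath)
  }
}
```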
