[SPARK-17361][SQL] file-based external table without path should not be created #14921
Changes from all commits: 2533d65, eefe3bc, 600ba8c, e42a0ad, 4071bec
DataSource.scala

```diff
@@ -315,12 +315,8 @@ case class DataSource(
   /**
    * Create a resolved [[BaseRelation]] that can be used to read data from or write data into this
    * [[DataSource]]
-   *
-   * @param checkPathExist A flag to indicate whether to check the existence of path or not.
-   *                       This flag will be set to false when we create an empty table (the
-   *                       path of the table does not exist).
    */
-  def resolveRelation(checkPathExist: Boolean = true): BaseRelation = {
+  def resolveRelation(): BaseRelation = {
```
Contributor:
Checked with Wenchen, it is not safe to skip calling `resolveRelation()` when it is a managed table.
Contributor:
For example, if it is a JDBC relation provider, we will call
Member:
@clockfly Sorry, I did not get your point. What you said above is only for the read path, right? The changes we made here are for the write path.
Member:
FYI, I just updated the write path for the JDBC connection today: #14077
Contributor:
@gatorsmile I mean the write path. When `createRelation()` is called on a `RelationProvider`, the provider may do some extra checks to make sure the provided options are valid. We'd better enforce those checks when trying to create a managed table. For example, `JdbcRelationProvider` will validate the options
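To illustrate the point being made here, a minimal, hypothetical sketch of eager option validation inside `createRelation()`. The provider class and its required `url` option are assumptions for the example, not `JdbcRelationProvider`'s actual code.

```scala
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider}
import org.apache.spark.sql.types.StructType

// Hypothetical provider: validates options as soon as createRelation() runs.
// Skipping resolveRelation() for managed tables would also skip such checks.
class ExampleRelationProvider extends RelationProvider {
  override def createRelation(
      ctx: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    // Fail fast on invalid options, before any table metadata is persisted.
    require(parameters.contains("url"), "Option 'url' is required")
    new BaseRelation {
      override val sqlContext: SQLContext = ctx
      override val schema: StructType = StructType(Nil)
    }
  }
}
```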
Contributor (Author):
What I said before is wrong; managed tables still need to call
Member:
When a data source wants to implement a write path (
Member:
Based on my understanding,
Member:
After a discussion with Wenchen,
Member:
To clarify it,
```diff
     val caseInsensitiveOptions = new CaseInsensitiveMap(options)
     val relation = (providingClass.newInstance(), userSpecifiedSchema) match {
       // TODO: Throw when too much is given.
```
```diff
@@ -367,11 +363,11 @@ case class DataSource(
         val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
         val globPath = SparkHadoopUtil.get.globPathIfNecessary(qualified)

-        if (checkPathExist && globPath.isEmpty) {
+        if (globPath.isEmpty) {
           throw new AnalysisException(s"Path does not exist: $qualified")
         }
         // Sufficient to check head of the globPath seq for non-glob scenario
-        if (checkPathExist && !fs.exists(globPath.head)) {
+        if (!fs.exists(globPath.head)) {
           throw new AnalysisException(s"Path does not exist: ${globPath.head}")
         }
         globPath
```
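With the `checkPathExist` flag gone, the existence check always runs. A hedged sketch of the user-visible effect this PR aims for; the table name, path, and session setup are illustrative:

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

val spark = SparkSession.builder().master("local").appName("spark-17361").getOrCreate()

try {
  // Pointing a file-based table at a nonexistent path now fails during
  // analysis instead of creating a broken table.
  spark.sql("CREATE TABLE t USING parquet OPTIONS (path '/no/such/dir')")
} catch {
  case e: AnalysisException => println(e.getMessage) // "Path does not exist: ..."
}
```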
```diff
@@ -391,7 +387,7 @@ case class DataSource(
         val fileCatalog =
           new ListingFileCatalog(
-            sparkSession, globbedPaths, options, partitionSchema, !checkPathExist)
+            sparkSession, globbedPaths, options, partitionSchema)

         val dataSchema = userSpecifiedSchema.map { schema =>
           val equality = sparkSession.sessionState.conf.resolver
```
ListingFileCatalog.scala

```diff
@@ -17,10 +17,7 @@
 package org.apache.spark.sql.execution.datasources

-import java.io.FileNotFoundException
-
 import scala.collection.mutable
 import scala.util.Try

 import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path}
 import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
```
```diff
@@ -37,16 +34,12 @@ import org.apache.spark.sql.types.StructType
  * @param paths a list of paths to scan
  * @param partitionSchema an optional partition schema that will be use to provide types for the
  *                        discovered partitions
- * @param ignoreFileNotFound if true, return empty file list when encountering a
- *                           [[FileNotFoundException]] in file listing. Note that this is a hack
- *                           for SPARK-16313. We should get rid of this flag in the future.
  */
 class ListingFileCatalog(
     sparkSession: SparkSession,
     override val paths: Seq[Path],
     parameters: Map[String, String],
-    partitionSchema: Option[StructType],
-    ignoreFileNotFound: Boolean = false)
+    partitionSchema: Option[StructType])
   extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) {

   @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
```
```diff
@@ -88,7 +81,7 @@ class ListingFileCatalog(
    */
   def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
     if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-      HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession, ignoreFileNotFound)
+      HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
     } else {
       // Right now, the number of paths is less than the value of
       // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver.
```
```diff
@@ -104,12 +97,7 @@ class ListingFileCatalog(
         logTrace(s"Listing $path on driver")

         val childStatuses = {
-          val stats =
-            try {
-              fs.listStatus(path)
-            } catch {
-              case e: FileNotFoundException if ignoreFileNotFound => Array.empty[FileStatus]
-            }
+          val stats = fs.listStatus(path)
           if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats
         }
```
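With the `ignoreFileNotFound` escape hatch removed, a missing path is no longer swallowed into an empty listing. A small sketch of the resulting behavior, assuming a Hadoop filesystem handle; the path is illustrative:

```scala
import java.io.FileNotFoundException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val path = new Path("/tmp/vanished")
val fs = path.getFileSystem(new Configuration())

// fs.listStatus() now lets FileNotFoundException propagate to the caller;
// paths are expected to be validated up front in DataSource.resolveRelation().
try {
  fs.listStatus(path)
} catch {
  case e: FileNotFoundException => println(s"Listing failed: ${e.getMessage}")
}
```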
We have renamed `CatalogStorageFormat.serdeProperties` to `properties`; this should also be updated.