[SPARK-16787] SparkContext.addFile() should not throw if called twice with the same file #14396
Changes from all commits:
```diff
@@ -21,7 +21,7 @@ import java.io._
 import java.lang.reflect.Constructor
 import java.net.URI
 import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID}
-import java.util.concurrent.ConcurrentMap
+import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap}
 import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference}
 
 import scala.collection.JavaConverters._
```
```diff
@@ -262,8 +262,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   private[spark] def env: SparkEnv = _env
 
   // Used to store a URL for each static file/jar together with the file's local timestamp
-  private[spark] val addedFiles = HashMap[String, Long]()
-  private[spark] val addedJars = HashMap[String, Long]()
+  private[spark] val addedFiles = new ConcurrentHashMap[String, Long]().asScala
+  private[spark] val addedJars = new ConcurrentHashMap[String, Long]().asScala
 
   // Keeps track of all persisted RDDs
   private[spark] val persistentRdds = {
```
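For context: wrapping a `java.util.concurrent.ConcurrentHashMap` in `asScala` yields a thread-safe `scala.collection.concurrent.Map` whose `putIfAbsent` returns an `Option` — `None` only when the key was actually inserted. A minimal standalone sketch of the idiom the patch relies on (the demo object and key are invented; this is not Spark code):

```scala
import java.util.concurrent.ConcurrentHashMap
import scala.collection.JavaConverters._

object PutIfAbsentDemo extends App {
  // asScala produces a mutable Scala map view backed by the ConcurrentHashMap,
  // so all operations keep the underlying map's thread-safety guarantees.
  val addedFiles = new ConcurrentHashMap[String, Long]().asScala

  // First call: key absent, value inserted, returns None.
  assert(addedFiles.putIfAbsent("file:/tmp/data.txt", 1L).isEmpty)

  // Second call: key present, existing value untouched, returns Some(1L).
  assert(addedFiles.putIfAbsent("file:/tmp/data.txt", 2L).contains(1L))
  assert(addedFiles("file:/tmp/data.txt") == 1L)
}
```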
```diff
@@ -1430,14 +1430,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       schemeCorrectedPath
     }
     val timestamp = System.currentTimeMillis
-    addedFiles(key) = timestamp
-
-    // Fetch the file locally in case a job is executed using DAGScheduler.runLocally().
-    Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
-      hadoopConfiguration, timestamp, useCache = false)
-
-    logInfo("Added file " + path + " at " + key + " with timestamp " + addedFiles(key))
-    postEnvironmentUpdate()
+    if (addedFiles.putIfAbsent(key, timestamp).isEmpty) {
+      logInfo(s"Added file $path at $key with timestamp $timestamp")
+      // Fetch the file locally so that closures which are run on the driver can still use the
+      // SparkFiles API to access files.
+      Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
+        hadoopConfiguration, timestamp, useCache = false)
+      postEnvironmentUpdate()
+    }
   }
 
   /**
```

Inline review thread on the local `Utils.fetchFile` call:

Contributor (Author): I believe that this line is unnecessary now that …

Contributor (Author): Yep, turns out this is needed in the Python tests so I added it back.
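To illustrate the behavior fixed here, a hedged driver-side sketch (the master, app name, and temp file are invented for the example): after this change, a second `addFile` call with the same path is a silent no-op instead of failing.

```scala
import java.nio.file.Files
import org.apache.spark.{SparkConf, SparkContext}

object AddFileTwiceDemo extends App {
  val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("demo"))

  // A throwaway file to register with the context.
  val tmp = Files.createTempFile("demo", ".txt")
  Files.write(tmp, "hello".getBytes("UTF-8"))

  sc.addFile(tmp.toString)
  // Before SPARK-16787 this second call could blow up when the driver
  // re-fetched a file it had already downloaded; with the putIfAbsent guard
  // it is simply ignored.
  sc.addFile(tmp.toString)

  sc.stop()
}
```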
```diff
@@ -1705,12 +1705,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
             case exc: FileNotFoundException =>
               logError(s"Jar not found at $path")
               null
-            case e: Exception =>
-              // For now just log an error but allow to go through so spark examples work.
-              // The spark examples don't really need the jar distributed since its also
-              // the app jar.
-              logError("Error adding jar (" + e + "), was the --addJars option used?")
-              null
           }
         }
       // A JAR file which exists locally on every worker node
```
```diff
@@ -1721,11 +1715,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
         }
       }
       if (key != null) {
-        addedJars(key) = System.currentTimeMillis
-        logInfo("Added JAR " + path + " at " + key + " with timestamp " + addedJars(key))
+        val timestamp = System.currentTimeMillis
+        if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
+          logInfo(s"Added JAR $path at $key with timestamp $timestamp")
+          postEnvironmentUpdate()
+        }
       }
     }
-    postEnvironmentUpdate()
   }
 
   /**
```
Review thread on the `putIfAbsent` change:

Reviewer: Is `putIfAbsent` correct here? It won't update the timestamp when you call `addFile` for the same file again.

Reviewer: Nevermind, I see you're actually changing the behavior from 1.x instead of restoring it.

Contributor (Author): Yeah, this is actually intentional. I experimented with implementing the 1.x behavior and started writing tests to verify that newer versions of files took precedence over older ones, but then discovered that Spark executors crash with exceptions if they have already downloaded a file with a given name and the new file's contents don't match the old one. Given that behavior, updating the timestamp works only if the new file has the same contents as the old file (in which case it doesn't matter what we do with the timestamp), or if all executors are replaced before running any further tasks (an obscure use case that we don't want to optimize for).
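The crash the author mentions is what motivates leaving the first timestamp in place. A simplified sketch of that executor-side rule — illustrative code only, not Spark's actual `Utils.fetchFile` implementation, with a hypothetical `fetchToWorkingDir` helper:

```scala
import java.io.File
import java.nio.file.Files
import java.util.Arrays

object FetchRuleSketch {
  // Hypothetical helper mirroring the rule the author describes: once an
  // executor has fetched a file under a given name, a later fetch of a
  // conflicting version fails rather than overwriting it.
  def fetchToWorkingDir(source: File, targetDir: File): File = {
    val target = new File(targetDir, source.getName)
    if (target.exists()) {
      val sameContents = Arrays.equals(
        Files.readAllBytes(source.toPath), Files.readAllBytes(target.toPath))
      if (!sameContents) {
        // This is why bumping the timestamp for changed contents cannot help:
        // live executors would hit this failure on their next task.
        throw new IllegalStateException(
          s"File ${target.getName} exists and does not match contents of $source")
      }
    } else {
      Files.copy(source.toPath, target.toPath)
    }
    target
  }
}
```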