[SPARK-34472][YARN] Ship ivySettings file to driver in cluster mode #31591
Changes from all commits: 0d62c5c, f3101b9, 3f49d07, bf91027, 998580e, a4d76d7, d99cc86, fab9f9e, fd3ddb2
**Client.scala**

```diff
@@ -21,6 +21,7 @@ import java.io.{FileSystem => _, _}
 import java.net.{InetAddress, UnknownHostException, URI}
 import java.nio.ByteBuffer
 import java.nio.charset.StandardCharsets
+import java.nio.file.Files
 import java.util.{Locale, Properties, UUID}
 import java.util.zip.{ZipEntry, ZipOutputStream}
@@ -30,7 +31,6 @@ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map}
 import scala.util.control.NonFatal

 import com.google.common.base.Objects
-import com.google.common.io.Files
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.fs.permission.FsPermission
```
```diff
@@ -518,6 +518,32 @@ private[spark] class Client(
       require(localizedPath != null, "Keytab file already distributed.")
     }

+    // If we passed in an ivySettings file, make sure we copy the file to the distributed cache
+    // in cluster mode so that the driver can access it
+    val ivySettings = sparkConf.getOption("spark.jars.ivySettings")
+    val ivySettingsLocalizedPath: Option[String] = ivySettings match {
+      case Some(ivySettingsPath) if isClusterMode =>
+        val uri = new URI(ivySettingsPath)
+        Option(uri.getScheme).getOrElse("file") match {
+          case "file" =>
+            val ivySettingsFile = new File(uri.getPath)
+            require(ivySettingsFile.exists(), s"Ivy settings file $ivySettingsFile not found")
+            require(ivySettingsFile.isFile(), s"Ivy settings file $ivySettingsFile is not a " +
+              "normal file")
+            // Generate a file name that can be used for the ivySettings file, that does not
+            // conflict with any user file.
+            val localizedFileName = Some(ivySettingsFile.getName() + "-" +
+              UUID.randomUUID().toString)
+            val (_, localizedPath) = distribute(ivySettingsPath, destName = localizedFileName)
+            require(localizedPath != null, "IvySettings file already distributed.")
+            Some(localizedPath)
+          case scheme =>
+            throw new IllegalArgumentException(s"Scheme $scheme not supported in " +
+              "spark.jars.ivySettings")
+        }
+      case _ => None
+    }
+
   /**
    * Add Spark to the cache. There are two settings that control what files to add to the cache:
    * - if a Spark archive is defined, use the archive. The archive is expected to contain
```

**Contributor:** This block of code (and the IllegalArgumentException below) -- convert to URI, check scheme, throw if bad scheme, assert on file existence -- is duplicated between the two code paths. Not too big of a duplication, so if it can't be done cleanly, no worries from my end.

**Contributor (Author):** The assertions of file existence are slightly different in both places. In YarnClient we would want to make this assertion only for the `file` scheme.

**Contributor:** SGTM, thanks!
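The thread above left the duplication in place. If the validation were ever factored out, one possible shape, purely a sketch and not part of this PR (the helper name and the `checkLocalFile` flag are hypothetical, and the differing existence assertions are handled by making the local-file check optional):

```scala
import java.io.File
import java.net.URI

// Hypothetical shared helper (not in this PR): convert to URI, check the
// scheme, throw on an unsupported scheme, and optionally assert that the
// local file exists, since the two call sites assert existence under
// slightly different conditions.
def validateIvySettingsUri(ivySettingsPath: String, checkLocalFile: Boolean): URI = {
  val uri = new URI(ivySettingsPath)
  Option(uri.getScheme).getOrElse("file") match {
    case "file" if checkLocalFile =>
      val file = new File(uri.getPath)
      require(file.exists(), s"Ivy settings file $file not found")
      require(file.isFile(), s"Ivy settings file $file is not a normal file")
    case "file" => // no local existence check requested
    case scheme =>
      throw new IllegalArgumentException(
        s"Scheme $scheme not supported in spark.jars.ivySettings")
  }
  uri
}
```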
```diff
@@ -576,7 +602,7 @@ private[spark] class Client(
         jarsDir.listFiles().foreach { f =>
           if (f.isFile && f.getName.toLowerCase(Locale.ROOT).endsWith(".jar") && f.canRead) {
             jarsStream.putNextEntry(new ZipEntry(f.getName))
-            Files.copy(f, jarsStream)
+            Files.copy(f.toPath, jarsStream)
             jarsStream.closeEntry()
           }
         }
```
```diff
@@ -672,7 +698,18 @@ private[spark] class Client(
     val remoteFs = FileSystem.get(remoteConfArchivePath.toUri(), hadoopConf)
     cachedResourcesConf.set(CACHED_CONF_ARCHIVE, remoteConfArchivePath.toString())

-    val localConfArchive = new Path(createConfArchive().toURI())
+    val confsToOverride = Map.empty[String, String]
+    // If propagating the keytab to the AM, override the keytab name with the name of the
+    // distributed file.
+    amKeytabFileName.foreach { kt => confsToOverride.put(KEYTAB.key, kt) }
+
+    // If propagating the ivySettings file to the distributed cache, override the ivySettings
+    // file name with the name of the distributed file.
+    ivySettingsLocalizedPath.foreach { path =>
+      confsToOverride.put("spark.jars.ivySettings", path)
+    }
+
+    val localConfArchive = new Path(createConfArchive(confsToOverride).toURI())
     copyFileToRemote(destDir, localConfArchive, replication, symlinkCache, force = true,
       destName = Some(LOCALIZED_CONF_ARCHIVE))
```
```diff
@@ -701,8 +738,10 @@ private[spark] class Client(
    *
    * The archive also contains some Spark configuration. Namely, it saves the contents of
    * SparkConf in a file to be loaded by the AM process.
+   *
+   * @param confsToOverride configs that should be overridden when creating the final spark conf file
    */
-  private def createConfArchive(): File = {
+  private def createConfArchive(confsToOverride: Map[String, String]): File = {
     val hadoopConfFiles = new HashMap[String, File]()

     // SPARK_CONF_DIR shows up in the classpath before HADOOP_CONF_DIR/YARN_CONF_DIR
```
```diff
@@ -764,7 +803,7 @@ private[spark] class Client(
         if url.getProtocol == "file" } {
       val file = new File(url.getPath())
       confStream.putNextEntry(new ZipEntry(file.getName()))
-      Files.copy(file, confStream)
+      Files.copy(file.toPath, confStream)
       confStream.closeEntry()
     }
```
```diff
@@ -775,7 +814,7 @@ private[spark] class Client(
     hadoopConfFiles.foreach { case (name, file) =>
       if (file.canRead()) {
         confStream.putNextEntry(new ZipEntry(s"$LOCALIZED_HADOOP_CONF_DIR/$name"))
-        Files.copy(file, confStream)
+        Files.copy(file.toPath, confStream)
         confStream.closeEntry()
       }
     }
```
```diff
@@ -788,11 +827,7 @@ private[spark] class Client(
     // Save Spark configuration to a file in the archive.
     val props = confToProperties(sparkConf)
-
-    // If propagating the keytab to the AM, override the keytab name with the name of the
-    // distributed file.
-    amKeytabFileName.foreach { kt => props.setProperty(KEYTAB.key, kt) }
-
+    confsToOverride.foreach { case (k, v) => props.setProperty(k, v) }
     writePropertiesToArchive(props, SPARK_CONF_FILE, confStream)

     // Write the distributed cache config to the archive.
```
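Taken together, the Client.scala changes ship a local ivySettings file via the distributed cache in cluster mode and rewrite `spark.jars.ivySettings` in the conf archive to the localized name (the original file name plus a random UUID suffix). A hypothetical cluster-mode submission exercising this; the paths and class name are illustrative only, not from this PR:

```
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --conf spark.jars.ivySettings=/etc/spark/ivysettings.xml \
  --class com.example.Main \
  app.jar
```

On the driver, a call such as `sc.addJar("ivy://group:artifact:version")` then resolves against the shipped settings file rather than a path that only exists on the submitting machine.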
**YarnClusterSuite.scala**

```diff
@@ -29,6 +29,7 @@ import com.google.common.io.{ByteStreams, Files}
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.util.ConverterUtils
 import org.scalatest.concurrent.Eventually._
+import org.scalatest.exceptions.TestFailedException
 import org.scalatest.matchers.must.Matchers
 import org.scalatest.matchers.should.Matchers._
```
```diff
@@ -368,6 +369,64 @@ class YarnClusterSuite extends BaseYarnClusterSuite {
     )
     checkResult(finalState, result, "true")
   }

+  def createEmptyIvySettingsFile: File = {
+    val emptyIvySettings = File.createTempFile("ivy", ".xml")
+    Files.write("<ivysettings />", emptyIvySettings, StandardCharsets.UTF_8)
+    emptyIvySettings
+  }
+
+  test("SPARK-34472: ivySettings file with no scheme or file:// scheme should be " +
+    "localized on driver in cluster mode") {
+    val emptyIvySettings = createEmptyIvySettingsFile
+    // For file:// URIs or URIs without scheme, make sure that ivySettings conf was changed
+    // to the localized file. So the expected ivySettings path on the driver will start with
+    // the file name and then some random UUID suffix
+    testIvySettingsDistribution(clientMode = false, emptyIvySettings.getAbsolutePath,
+      emptyIvySettings.getName, prefixMatch = true)
+    testIvySettingsDistribution(clientMode = false, s"file://${emptyIvySettings.getAbsolutePath}",
+      emptyIvySettings.getName, prefixMatch = true)
+  }
+
+  test("SPARK-34472: ivySettings file with no scheme or file:// scheme should retain " +
+    "user provided path in client mode") {
+    val emptyIvySettings = createEmptyIvySettingsFile
+    // In client mode, the file is present locally on the driver and so does not need to be
+    // distributed. So the user provided path should be kept as is.
+    testIvySettingsDistribution(clientMode = true, emptyIvySettings.getAbsolutePath,
+      emptyIvySettings.getAbsolutePath)
+    testIvySettingsDistribution(clientMode = true, s"file://${emptyIvySettings.getAbsolutePath}",
+      s"file://${emptyIvySettings.getAbsolutePath}")
+  }
+
+  test("SPARK-34472: ivySettings file with non-file:// schemes should throw an error") {
+    val emptyIvySettings = createEmptyIvySettingsFile
+    val e1 = intercept[TestFailedException] {
+      testIvySettingsDistribution(clientMode = false,
+        s"local://${emptyIvySettings.getAbsolutePath}", "")
+    }
+    assert(e1.getMessage.contains("IllegalArgumentException: " +
+      "Scheme local not supported in spark.jars.ivySettings"))
+    val e2 = intercept[TestFailedException] {
+      testIvySettingsDistribution(clientMode = false,
+        s"hdfs://${emptyIvySettings.getAbsolutePath}", "")
+    }
+    assert(e2.getMessage.contains("IllegalArgumentException: " +
+      "Scheme hdfs not supported in spark.jars.ivySettings"))
+  }
```
**Contributor:** Can we use NIO for these? No need for commons-io now that NIO supports this kind of stuff built-in.

**Contributor (Author):** I changed this to use Guava's Files, which is used in many places within this file. Can I create a follow-up PR to replace these with NIO?

**Contributor:** Just saw this comment, sounds fine to me.
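On the NIO point: the suite uses Guava's `Files.write(CharSequence, File, Charset)`. The deferred follow-up could switch to `java.nio.file.Files` roughly as below; this is a sketch of the suggestion, not code from this PR:

```scala
import java.io.File
import java.nio.charset.StandardCharsets
import java.nio.file.Files

// NIO equivalent of Guava's Files.write("<ivysettings />", file, UTF_8):
// write the bytes of the settings stub directly to the file's Path.
def writeEmptyIvySettings(file: File): Unit = {
  Files.write(file.toPath, "<ivysettings />".getBytes(StandardCharsets.UTF_8))
}
```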
```diff
+  def testIvySettingsDistribution(clientMode: Boolean, ivySettingsPath: String,
+      expectedIvySettingsPrefixOnDriver: String, prefixMatch: Boolean = false): Unit = {
+    val result = File.createTempFile("result", null, tempDir)
+    val outFile = File.createTempFile("out", null, tempDir)
+    val finalState = runSpark(clientMode = clientMode,
+      mainClassName(YarnAddJarTest.getClass),
+      appArgs = Seq(result.getAbsolutePath, expectedIvySettingsPrefixOnDriver,
+        prefixMatch.toString),
+      extraConf = Map("spark.jars.ivySettings" -> ivySettingsPath),
+      outFile = Option(outFile))
+    checkResult(finalState, result, outFile = Option(outFile))
+  }
 }

 private[spark] class SaveExecutorInfo extends SparkListener {
```
```diff
@@ -583,6 +642,50 @@ private object YarnClasspathTest extends Logging {
 }

+private object YarnAddJarTest extends Logging {
+  def main(args: Array[String]): Unit = {
+    if (args.length != 3) {
+      // scalastyle:off println
+      System.err.println(
+        s"""
+        |Invalid command line: ${args.mkString(" ")}
+        |
+        |Usage: YarnAddJarTest [result file] [expected ivy settings path] [prefix match]
+        """.stripMargin)
+      // scalastyle:on println
+      System.exit(1)
+    }
+
+    val resultPath = args(0)
+    val expectedIvySettingsPath = args(1)
+    val prefixMatch = args(2).toBoolean
+    val sc = new SparkContext(new SparkConf())
+
+    var result = "failure"
+    try {
+      val settingsFile = sc.getConf.get("spark.jars.ivySettings")
+      if (prefixMatch) {
+        assert(settingsFile !== expectedIvySettingsPath)
+        assert(settingsFile.startsWith(expectedIvySettingsPath))
+      } else {
+        assert(settingsFile === expectedIvySettingsPath)
+      }
+
+      val caught = intercept[RuntimeException] {
+        sc.addJar("ivy://org.fake-project.test:test:1.0.0")
+      }
+      if (caught.getMessage.contains("unresolved dependency: org.fake-project.test#test")) {
+        // "unresolved dependency" is expected as the dependency does not exist
+        // but exception like "Ivy settings file <file> does not exist" should result in failure
+        result = "success"
+      }
+    } finally {
+      Files.write(result, new File(resultPath), StandardCharsets.UTF_8)
+      sc.stop()
+    }
+  }
+}
+
 private object YarnLauncherTestApp {

   def main(args: Array[String]): Unit = {
```
**Contributor:** @shardulm94, does it work if you set `spark.jars.ivySettings` to `./ivysettings.xml` and pass `ivysettings.xml` to `spark.yarn.files`? I know you might have to copy `ivysettings.xml` to the current working directory when spark-submit runs, but I think it might work for your use case.

**Contributor (Author):** Yeah, as you pointed out, it only works if I have the copy in the current working directory. However, we cannot control which directory our users launch spark-submit from, so ideally we would want something that works without user intervention. What do you think of this? In the YARN Client, we can add the ivySettings file to `spark.yarn.dist.files` or `__spark__conf__.zip`, and then modify the property `spark.jars.ivySettings` to point to `./ivysettings.xml` or `__spark__conf__/ivysettings.xml` within the YARN Client.

**Contributor:** Adding it into `__spark__conf__` sounds okay to me, but I would prefer to have second opinions from YARN experts such as @tgravescs, @mridulm or @jerryshao.

**Contributor (Author):** I tried this out, and it looks like it can be handled pretty cleanly this way, targeting just YARN: shardulm94@12709f0

**Contributor (Author):** I updated the PR with this approach.
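For reference, a sketch of the workaround discussed above, which predates this change: it only helps if a copy of the settings file also sits in the directory spark-submit is launched from, which is exactly the limitation that motivated the PR. The flags and paths here are illustrative assumptions, not commands from the thread:

```
# ivysettings.xml must also exist in the current working directory so the
# client-side resolution step can find ./ivysettings.xml at submit time.
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --files /path/to/ivysettings.xml \
  --conf spark.jars.ivySettings=./ivysettings.xml \
  ...
```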