From caca69537a762e2850193819fc1eb218f8f20876 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Mon, 21 Sep 2015 15:28:47 -0700 Subject: [PATCH 1/4] Add application attempt window for Spark on Yarn --- docs/running-on-yarn.md | 8 ++++++++ .../org/apache/spark/deploy/yarn/Client.scala | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 0e25ccf512c02..6e0f3e65f6525 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -303,6 +303,14 @@ If you need a reference to the proper location to put log files in the YARN so t It should be no larger than the global number of max attempts in the YARN configuration. + + spark.yarn.attemptFailuresValidityInterval + -1 + + Ignore the failure number which happens out the validity interval (in millisecond). + Default value -1 means this validity interval is not enabled. + + spark.yarn.submit.waitAppCompletion true diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index a2c4bc2f5480b..9e997f4239db7 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -185,6 +185,23 @@ private[spark] class Client( case None => logDebug("spark.yarn.maxAppAttempts is not set. 
" + "Cluster's default value will be used.") } + sparkConf.getOption("spark.yarn.attemptFailuresValidityInterval").map(_.toLong) match { + case Some(v) => + require(v > 0, "spark.yarn.attemptFailuresValidityInterval should be greater than 0") + try { + val method = appContext.getClass().getMethod( + "setAttemptFailuresValidityInterval", classOf[Long]) + method.invoke(appContext, v: java.lang.Long) + } catch { + case e: NoSuchMethodException => + logWarning("Ignoring spark.yarn.attemptFailuresValidityInterval because the version " + + "of YARN does not support it") + } + case None => + logDebug("spark.yarn.attemptFailuresValidityInterval is not set, " + + "only use spark.yarn.maxAppAppAttempts to control the application failure attempts") + } + val capability = Records.newRecord(classOf[Resource]) capability.setMemory(args.amMemory + amMemoryOverhead) capability.setVirtualCores(args.amCores) From 1c9afd058eaf495f3de9c69b901e8fb7aa9ade49 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Sat, 10 Oct 2015 10:14:12 +0800 Subject: [PATCH 2/4] Address the comments --- docs/running-on-yarn.md | 10 ++++--- .../org/apache/spark/deploy/yarn/Client.scala | 27 +++++++++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 6e0f3e65f6525..e15cded1b9e03 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -304,11 +304,13 @@ If you need a reference to the proper location to put log files in the YARN so t - spark.yarn.attemptFailuresValidityInterval - -1 + spark.yarn.am.attemptFailuresValidityInterval + none - Ignore the failure number which happens out the validity interval (in millisecond). - Default value -1 means this validity interval is not enabled. + Defines the validity interval (in millisecond) for AM failure tracking. + If the AM has been running for at least long, the AM failure count will be reset. + Default value None means this validity interval is not enabled. 
+ This feaure is only supported in Hadoop 2.6+. diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 9e997f4239db7..92243279782f2 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -185,21 +185,18 @@ private[spark] class Client( case None => logDebug("spark.yarn.maxAppAttempts is not set. " + "Cluster's default value will be used.") } - sparkConf.getOption("spark.yarn.attemptFailuresValidityInterval").map(_.toLong) match { - case Some(v) => - require(v > 0, "spark.yarn.attemptFailuresValidityInterval should be greater than 0") - try { - val method = appContext.getClass().getMethod( - "setAttemptFailuresValidityInterval", classOf[Long]) - method.invoke(appContext, v: java.lang.Long) - } catch { - case e: NoSuchMethodException => - logWarning("Ignoring spark.yarn.attemptFailuresValidityInterval because the version " + - "of YARN does not support it") - } - case None => - logDebug("spark.yarn.attemptFailuresValidityInterval is not set, " + - "only use spark.yarn.maxAppAppAttempts to control the application failure attempts") + + if (sparkConf.contains("spark.yarn.am.attemptFailuresValidityInterval")) { + try { + val interval = sparkConf.getTimeAsMs("spark.yarn.am.attemptFailuresValidityInterval") + val method = appContext.getClass().getMethod( + "setAttemptFailuresValidityInterval", classOf[Long]) + method.invoke(appContext, interval: java.lang.Long) + } catch { + case e: NoSuchMethodException => + logWarning("Ignoring spark.yarn.am.attemptFailuresValidityInterval because the version " + + "of YARN does not support it") + } } val capability = Records.newRecord(classOf[Resource]) From 7f9b77d00d1e7b8eabbf6a9fe2972e3263c7945f Mon Sep 17 00:00:00 2001 From: jerryshao Date: Sat, 10 Oct 2015 10:44:53 +0800 Subject: [PATCH 3/4] Style change --- docs/running-on-yarn.md | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index e15cded1b9e03..28b99565cc6a0 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -305,7 +305,7 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.am.attemptFailuresValidityInterval - none + (none) Defines the validity interval (in millisecond) for AM failure tracking. If the AM has been running for at least long, the AM failure count will be reset. From 36eabdc48a8ed1f574e8655b509ae8d1c31877cd Mon Sep 17 00:00:00 2001 From: jerryshao Date: Sat, 10 Oct 2015 14:33:32 +0800 Subject: [PATCH 4/4] change the doc --- docs/running-on-yarn.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 28b99565cc6a0..972e83d7e9547 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -307,10 +307,9 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.am.attemptFailuresValidityInterval (none) - Defines the validity interval (in millisecond) for AM failure tracking. - If the AM has been running for at least long, the AM failure count will be reset. - Default value None means this validity interval is not enabled. - This feaure is only supported in Hadoop 2.6+. + Defines the validity interval for AM failure tracking. + If the AM has been running for at least the defined interval, the AM failure count will be reset. + This feature is not enabled if not configured, and only supported in Hadoop 2.6+.