[SPARK-8425][CORE] Application Level Blacklisting #14079
Changes from all commits
@@ -17,10 +17,274 @@
```scala
package org.apache.spark.scheduler

import java.util.concurrent.atomic.AtomicReference

import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config
import org.apache.spark.util.{Clock, SystemClock, Utils}

/**
 * BlacklistTracker is designed to track problematic executors and nodes. It supports blacklisting
 * executors and nodes across an entire application (with a periodic expiry). TaskSetManagers add
 * additional blacklisting of executors and nodes for individual tasks and stages which works in
 * concert with the blacklisting here.
 *
 * The tracker needs to deal with a variety of workloads, e.g.:
 *
 *  * bad user code -- this may lead to many task failures, but that should not count against
 *    individual executors
 *  * many small stages -- this may prevent a bad executor from having many failures within one
 *    stage, but still many failures over the entire application
 *  * "flaky" executors -- they don't fail every task, but are still faulty enough to merit
 *    blacklisting
 *
 * See the design doc on SPARK-8425 for a more in-depth discussion.
 *
 * THREADING: As with most helpers of TaskSchedulerImpl, this is not thread-safe. Though it is
 * called by multiple threads, callers must already have a lock on the TaskSchedulerImpl. The
 * one exception is [[nodeBlacklist()]], which can be called without holding a lock.
 */
private[scheduler] class BlacklistTracker (
    conf: SparkConf,
    clock: Clock = new SystemClock()) extends Logging {

  BlacklistTracker.validateBlacklistConfs(conf)
  private val MAX_FAILURES_PER_EXEC = conf.get(config.MAX_FAILURES_PER_EXEC)
  private val MAX_FAILED_EXEC_PER_NODE = conf.get(config.MAX_FAILED_EXEC_PER_NODE)
  val BLACKLIST_TIMEOUT_MILLIS = BlacklistTracker.getBlacklistTimeout(conf)

  /**
   * A map from executorId to information on task failures. Tracks the time of each task failure,
   * so that we can avoid blacklisting executors due to failures that are very far apart. We do
   * not actively remove entries as soon as tasks hit their timeouts, to avoid the time it would
   * take to do so. But it will not grow too large, because as soon as an executor gets too many
   * failures, we blacklist the executor and remove its entry here.
   */
  private val executorIdToFailureList = new HashMap[String, ExecutorFailureList]()
  val executorIdToBlacklistStatus = new HashMap[String, BlacklistedExecutor]()
  val nodeIdToBlacklistExpiryTime = new HashMap[String, Long]()
  /**
   * An immutable copy of the set of nodes that are currently blacklisted. Kept in an
   * AtomicReference to make [[nodeBlacklist()]] thread-safe.
   */
  private val _nodeBlacklist = new AtomicReference[Set[String]](Set())
  /**
   * Time when the next blacklist will expire. Used as a shortcut to avoid iterating over all
   * entries in the blacklist when none will have expired.
   */
  var nextExpiryTime: Long = Long.MaxValue
  /**
   * Mapping from nodes to all of the executors that have been blacklisted on that node. We do
   * *not* remove entries when executors are removed from Spark, so we can track when we get
   * multiple successive blacklisted executors on one node. Nonetheless, it will not grow too
   * large, because there cannot be many blacklisted executors on one node before we stop
   * requesting more executors on that node, and we clean up the list of blacklisted executors
   * once an executor has been blacklisted for BLACKLIST_TIMEOUT_MILLIS.
   */
  val nodeToBlacklistedExecs = new HashMap[String, HashSet[String]]()

  /**
   * Un-blacklists executors and nodes that have been blacklisted for at least
   * BLACKLIST_TIMEOUT_MILLIS.
   */
  def applyBlacklistTimeout(): Unit = {
```
Contributor: Can you add a docstring here? "Un-blacklists executors and nodes that have been blacklisted for at least BLACKLIST_TIMEOUT"?
```scala
    val now = clock.getTimeMillis()
    // Quickly check if we've got anything to expire from the blacklist -- if not, avoid doing
    // any work.
    if (now > nextExpiryTime) {
      // Apply the timeout to blacklisted nodes and executors.
      val execsToUnblacklist = executorIdToBlacklistStatus.filter(_._2.expiryTime < now).keys
      if (execsToUnblacklist.nonEmpty) {
        // Un-blacklist any executors that have been blacklisted longer than the blacklist timeout.
        logInfo(s"Removing executors $execsToUnblacklist from blacklist because the blacklist " +
          s"for those executors has timed out")
        execsToUnblacklist.foreach { exec =>
          val status = executorIdToBlacklistStatus.remove(exec).get
          val failedExecsOnNode = nodeToBlacklistedExecs(status.node)
          failedExecsOnNode.remove(exec)
          if (failedExecsOnNode.isEmpty) {
            nodeToBlacklistedExecs.remove(status.node)
          }
        }
      }
      val nodesToUnblacklist = nodeIdToBlacklistExpiryTime.filter(_._2 < now).keys
      if (nodesToUnblacklist.nonEmpty) {
        // Un-blacklist any nodes that have been blacklisted longer than the blacklist timeout.
        logInfo(s"Removing nodes $nodesToUnblacklist from blacklist because the blacklist " +
          s"has timed out")
        nodeIdToBlacklistExpiryTime --= nodesToUnblacklist
        _nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet)
      }
      updateNextExpiryTime()
    }
  }
```
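As an editorial aside on how this method is meant to be driven: a minimal sketch of a driver-side caller, under the assumption (not shown in this diff) that the scheduler invokes `applyBlacklistTimeout()` before each round of resource offers. The class and method names in the sketch are illustrative only, not part of the PR.

```scala
// Hypothetical caller: run the expiry pass before deciding where to schedule, so
// executors/nodes whose blacklist entries have timed out become usable again.
private[scheduler] class SchedulerLoopSketch(blacklistTracker: BlacklistTracker) {

  // offers are (executorId, host) pairs in this sketch
  def filterOffers(offers: Seq[(String, String)]): Seq[(String, String)] = {
    // Drop expired blacklist entries first (cheap when nothing has expired,
    // thanks to the nextExpiryTime shortcut above).
    blacklistTracker.applyBlacklistTimeout()
    // Then skip offers from currently blacklisted executors or nodes.
    offers.filterNot { case (executorId, host) =>
      blacklistTracker.isExecutorBlacklisted(executorId) ||
        blacklistTracker.isNodeBlacklisted(host)
    }
  }
}
```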
```scala
  private def updateNextExpiryTime(): Unit = {
    val execMinExpiry = if (executorIdToBlacklistStatus.nonEmpty) {
      executorIdToBlacklistStatus.map{_._2.expiryTime}.min
    } else {
      Long.MaxValue
    }
    val nodeMinExpiry = if (nodeIdToBlacklistExpiryTime.nonEmpty) {
      nodeIdToBlacklistExpiryTime.values.min
    } else {
      Long.MaxValue
    }
    nextExpiryTime = math.min(execMinExpiry, nodeMinExpiry)
  }

  def updateBlacklistForSuccessfulTaskSet(
      stageId: Int,
      stageAttemptId: Int,
      failuresByExec: HashMap[String, ExecutorFailuresInTaskSet]): Unit = {
    // If any tasks failed, we count them towards the overall failure count for the executor at
    // this point.
    val now = clock.getTimeMillis()
```
Contributor: What happens for super long task sets (e.g., where the duration of the task set is longer than the blacklist timeout)? In that case, we could be adding things to the blacklist that have already expired -- is that handled correctly?

Contributor (author): Well, I think it depends what you mean by "handled correctly". We use the time the taskset completes, so it's OK if the failures happened long ago when the taskset started; we still count those failures in the app blacklist, so later failures can trickle in and push us over the limit.

OTOH, this also means that if we were already close to the limit on failures for the application when this taskset started, then a really long-running taskset will fail to push us over the limit -- by the time the latest task set finishes, we've expired the old failures, so we only get failures from the new taskset. So if your taskset time is longer than the blacklist timeout, you're unlikely to ever get application-level blacklisting. Clearly this is not great, but it's not that bad. After all, even if it were app-level blacklisted, we'd still hit the timeout and remove the bad resources from the blacklist, so we'd need to rediscover them in future blacklists. One of the main reasons for the app-level blacklist is to avoid lots of failures when the tasksets are short. If you really want an application-level blacklist which is useful across really long tasksets, then you've got to crank up your timeout.

We could change this slightly by first updating the application-level blacklist, and then expiring failures past the timeout. But to me that behavior seems much less intuitive, for a pretty questionable gain. Does that make sense? What do you think?

Contributor (author): Oh gosh, I lied completely about the way the time handling works; sorry, that is somewhat embarrassing. But the overall comment about what happens with long tasksets still applies -- it's not so clear what the right thing to do is in that case. If you really want to handle long tasksets and have app-level blacklisting, you need to increase that timeout.

Contributor: Here's what I was concerned about: suppose BLACKLIST_TIMEOUT_MILLIS is 5 and MAX_FAILURES_PER_EXEC is 2. In a long-running task set, task set 0, a task fails on executor A at time 8, but the blacklist tracker doesn't find out about this until much later, when the task set finishes (for the sake of example, time 100). In the meantime task set 1 runs, has a task that fails on executor A at time 98, and then completes shortly thereafter at time 99. At this point, there have been two failures on executor A: one at time 8 and one at time 98. These are so far apart that they shouldn't cause A to be blacklisted. But it looks like when task set 0 finishes, we'll still add the entry at time 8 to ExecutorFailureList, and then hit MAX_FAILURES_PER_EXEC and blacklist executor A. This seems overly aggressive (i.e., it seems like long-running task sets can "unfairly" get executors blacklisted that actually had very spread-out failures, potentially far in the past). It looks like this could be fixed by swapping lines 150 and 151 (with a comment that this is to handle long task sets)? I think this is what you were saying seems confusing, but I think it is necessary to avoid blacklisting behavior that seems inconsistent with the timeout. Let me know if I'm misinterpreting the behavior here!

Contributor (author): Yeah, I think you are right; thanks for walking me through this example. I can't think of any behavior which is clearly "right" for long-running task sets, but I think what you are proposing is clearer. The other situation I was worried about is when the failures occur at the beginning of the taskset, but by the time the taskset finishes we're already past the timeout -- you'd never get app-level blacklisting. But thinking about this again, I think that is the best thing to do. One other alternative would be to completely ignore the time that individual tasks fail, and instead use the time the tasksets complete for the timeout. But I think that would be overall more confusing.
```scala
    failuresByExec.foreach { case (exec, failuresInTaskSet) =>
      val appFailuresOnExecutor =
        executorIdToFailureList.getOrElseUpdate(exec, new ExecutorFailureList)
      appFailuresOnExecutor.addFailures(stageId, stageAttemptId, failuresInTaskSet)
      appFailuresOnExecutor.dropFailuresWithTimeoutBefore(now)
      val newTotal = appFailuresOnExecutor.numUniqueTaskFailures

      val expiryTimeForNewBlacklists = now + BLACKLIST_TIMEOUT_MILLIS
      // If this pushes the total number of failures over the threshold, blacklist the executor.
      // If it's already blacklisted, we avoid "re-blacklisting" (which can happen if there were
      // other tasks already running in another taskset when it got blacklisted), because it makes
      // some of the logic around expiry times a little more confusing. But it also wouldn't be a
      // problem to re-blacklist, with a later expiry time.
      if (newTotal >= MAX_FAILURES_PER_EXEC && !executorIdToBlacklistStatus.contains(exec)) {
        logInfo(s"Blacklisting executor id: $exec because it has $newTotal" +
          s" task failures in successful task sets")
        val node = failuresInTaskSet.node
        executorIdToBlacklistStatus.put(exec, BlacklistedExecutor(node, expiryTimeForNewBlacklists))
        updateNextExpiryTime()

        // In addition to blacklisting the executor, we also update the data for failures on the
        // node, and potentially put the entire node into a blacklist as well.
        val blacklistedExecsOnNode = nodeToBlacklistedExecs.getOrElseUpdate(node, HashSet[String]())
        blacklistedExecsOnNode += exec
        // If the node is already in the blacklist, we avoid adding it again with a later expiry
        // time.
        if (blacklistedExecsOnNode.size >= MAX_FAILED_EXEC_PER_NODE &&
            !nodeIdToBlacklistExpiryTime.contains(node)) {
          logInfo(s"Blacklisting node $node because it has ${blacklistedExecsOnNode.size} " +
            s"executors blacklisted: ${blacklistedExecsOnNode}")
          nodeIdToBlacklistExpiryTime.put(node, expiryTimeForNewBlacklists)
          _nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet)
        }
      }
    }
  }

  def isExecutorBlacklisted(executorId: String): Boolean = {
    executorIdToBlacklistStatus.contains(executorId)
  }

  /**
   * Get the full set of nodes that are blacklisted. Unlike other methods in this class, this *IS*
   * thread-safe -- no lock required on a taskScheduler.
   */
  def nodeBlacklist(): Set[String] = {
    _nodeBlacklist.get()
  }

  def isNodeBlacklisted(node: String): Boolean = {
    nodeIdToBlacklistExpiryTime.contains(node)
  }

  def handleRemovedExecutor(executorId: String): Unit = {
    // We intentionally do not clean up executors that are already blacklisted in
    // nodeToBlacklistedExecs, so that if another executor on the same node gets blacklisted, we
    // can blacklist the entire node. We also can't clean up executorIdToBlacklistStatus, so we
    // can eventually remove the executor after the timeout. Despite not clearing those structures
    // here, we don't expect they will grow too big, since you won't get too many executors on one
    // node, and the timeout will clear it up periodically in any case.
    executorIdToFailureList -= executorId
  }

  /**
   * Tracks all failures for one executor (that have not passed the timeout).
   *
   * In general we actually expect this to be extremely small, since it won't contain more than
   * the maximum number of task failures before an executor is blacklisted (default 2).
   */
  private[scheduler] final class ExecutorFailureList extends Logging {

    private case class TaskId(stage: Int, stageAttempt: Int, taskIndex: Int)

    /**
     * All failures on this executor in successful task sets.
     */
    private var failuresAndExpiryTimes = ArrayBuffer[(TaskId, Long)]()
    /**
     * As an optimization, we track the min expiry time over all entries in failuresAndExpiryTimes
     * so it's quick to tell if there are any failures with expiry before the current time.
     */
    private var minExpiryTime = Long.MaxValue

    def addFailures(
        stage: Int,
        stageAttempt: Int,
        failuresInTaskSet: ExecutorFailuresInTaskSet): Unit = {
      failuresInTaskSet.taskToFailureCountAndFailureTime.foreach {
        case (taskIdx, (_, failureTime)) =>
          val expiryTime = failureTime + BLACKLIST_TIMEOUT_MILLIS
          failuresAndExpiryTimes += ((TaskId(stage, stageAttempt, taskIdx), expiryTime))
          if (expiryTime < minExpiryTime) {
            minExpiryTime = expiryTime
          }
      }
    }

    /**
     * The number of unique tasks that failed on this executor. Only counts failures within the
     * timeout, and in successful tasksets.
     */
    def numUniqueTaskFailures: Int = failuresAndExpiryTimes.size

    def isEmpty: Boolean = failuresAndExpiryTimes.isEmpty

    /**
     * Apply the timeout to individual tasks. This is to prevent one-off failures that are very
     * spread out in time (and likely have nothing to do with problems on the executor) from
     * triggering blacklisting. However, note that we do *not* remove executors and nodes from
     * the blacklist as we expire individual task failures -- each has its own timeout. E.g.,
     * suppose:
     *  * timeout = 10, maxFailuresPerExec = 2
     *  * Task 1 fails on exec 1 at time 0
     *  * Task 2 fails on exec 1 at time 5
     * --> exec 1 is blacklisted from time 5 - 15.
     * This is to simplify the implementation, as well as keep the behavior easier to understand
     * for the end user.
     */
    def dropFailuresWithTimeoutBefore(dropBefore: Long): Unit = {
      if (minExpiryTime < dropBefore) {
        var newMinExpiry = Long.MaxValue
        val newFailures = new ArrayBuffer[(TaskId, Long)]
        failuresAndExpiryTimes.foreach { case (task, expiryTime) =>
          if (expiryTime >= dropBefore) {
            newFailures += ((task, expiryTime))
            if (expiryTime < newMinExpiry) {
              newMinExpiry = expiryTime
            }
          }
        }
        failuresAndExpiryTimes = newFailures
        minExpiryTime = newMinExpiry
      }
    }

    override def toString(): String = {
      s"failures = $failuresAndExpiryTimes"
    }
  }

}

private[scheduler] object BlacklistTracker extends Logging {
```
@@ -80,7 +344,9 @@ private[scheduler] object BlacklistTracker extends Logging {

```scala
      config.MAX_TASK_ATTEMPTS_PER_EXECUTOR,
      config.MAX_TASK_ATTEMPTS_PER_NODE,
      config.MAX_FAILURES_PER_EXEC_STAGE,
      config.MAX_FAILED_EXEC_PER_NODE_STAGE,
      config.MAX_FAILURES_PER_EXEC,
      config.MAX_FAILED_EXEC_PER_NODE
    ).foreach { config =>
      val v = conf.get(config)
      if (v <= 0) {
```
@@ -112,3 +378,5 @@ private[scheduler] object BlacklistTracker extends Logging {

```scala
    }
  }
}

private final case class BlacklistedExecutor(node: String, expiryTime: Long)
```
Contributor: How about BlacklistStatus? "Executor" is confusing, since the class doesn't contain anything about an executor.

Contributor: Move this class inside BlacklistTracker? Or I wonder if it would be better to rename the map to executorIdToNodeAndExpiryTime and then just put a 2-item tuple in the map rather than this simple data structure (don't have strong feelings though, if you prefer the class).

Contributor (author): Good point, moved this and ...

Contributor (author): Ah, actually there is a minor reason why I can't put it inside BlacklistTracker -- I expose ...
Contributor comment on the _nodeBlacklist doc: Can you change this to have the first sentence say "An immutable copy of the set of nodes that are currently blacklisted (i.e., of the keys in nodeIdToBlacklistExpiryTime). Kept..."? (I keep forgetting why this is necessary)
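On that parenthetical ("I keep forgetting why this is necessary"): the point of the AtomicReference is that `nodeBlacklist()` can be read from threads that do not hold the TaskSchedulerImpl lock. A minimal sketch of that pattern follows; the class and method names are illustrative, and this is not code from the PR.

```scala
import java.util.concurrent.atomic.AtomicReference
import scala.collection.mutable

// Sketch of the snapshot pattern: writers mutate the map while holding a lock,
// then publish an immutable copy; readers on other threads only touch the
// AtomicReference, so they never need that lock.
class NodeBlacklistSnapshot {
  private val nodeIdToExpiry = new mutable.HashMap[String, Long]()
  private val snapshot = new AtomicReference[Set[String]](Set.empty[String])

  def addNode(node: String, expiry: Long): Unit = synchronized {
    nodeIdToExpiry.put(node, expiry)
    snapshot.set(nodeIdToExpiry.keySet.toSet)  // publish an immutable copy
  }

  def currentBlacklist(): Set[String] = snapshot.get()  // lock-free read
}
```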