This repository has been archived by the owner on Oct 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 841
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Don't destroy persistent volumes when killing unreachable resident tasks
Summary: Killing an unreachable resident task will do nothing, rather than destroy the reservations. Fixes #5207 Also-By: tharper@mesosphere.com Test Plan: sbt test Reviewers: unterstein, meichstedt, jasongilanfarr, jenkins Reviewed By: meichstedt, jasongilanfarr, jenkins Subscribers: jdef, marathon-team Differential Revision: https://phabricator.mesosphere.com/D529
- Loading branch information
1 parent
62bb2c7
commit 5f34abd
Showing
6 changed files
with
174 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
95 changes: 95 additions & 0 deletions
95
src/main/scala/mesosphere/marathon/core/task/termination/impl/KillAction.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
package mesosphere.marathon | ||
package core.task.termination.impl | ||
|
||
import com.typesafe.scalalogging.StrictLogging | ||
import mesosphere.marathon.core.condition.Condition | ||
import mesosphere.marathon.core.instance.Instance | ||
import mesosphere.marathon.core.task.Task | ||
|
||
/**
  * The ways a request to `kill` an instance can be carried out. [[KillAction.apply]] picks one of
  *  - [[KillAction.IssueKillRequest]]
  *  - [[KillAction.Noop]]
  *  - [[KillAction.ExpungeFromState]]
  * based on the instance's condition, the task ids handed in and whether the instance has a reservation.
  */
private[termination] sealed trait KillAction

private[termination] object KillAction extends StrictLogging {

  /**
    * Issue a kill request for the instance's tasks (via the scheduler driver). This is the default:
    * it is chosen whenever the instance is not in one of the conditions that won't respond to a kill
    * and at least one task id was supplied. Reservations are not consulted on this path.
    */
  case object IssueKillRequest extends KillAction

  /**
    * Deliberately do nothing. Chosen for instances with reservations that cannot be killed right now
    * (e.g. unreachable resident tasks), so that their reservations are left alone. See #5261
    */
  case object Noop extends KillAction

  /**
    * Forget about the instance instead of killing it. An Unreachable instance cannot have its Mesos
    * task killed, so the only way to get rid of it is to expunge its metadata from state. Should the
    * instance be reported as non-terminal later on, it will be killed then.
    */
  case object ExpungeFromState extends KillAction

  /*
   * Membership test for conditions in which a kill request is NOT expected to move the task towards
   * a terminal state: yields true when the given condition is one of those listed below.
   */
  private val wontRespondToKill: Condition => Boolean = {
    import Condition._
    Set(
      Unknown, Unreachable, UnreachableInactive,
      // TODO: Dropped and Gone could probably be dropped from this set:
      //   - both are terminal, so the task ids should already have been removed by the time we get here;
      //   - killing a Gone / Dropped task either puts it into a terminal state or causes no status
      //     change at all, so we end up in a terminal state either way.
      // They are kept only because we did not want to risk a behavior change in a point release.
      Dropped, Gone
    )
  }

  /**
    * Decides which [[KillAction]] to take for an instance.
    *
    * There are two situations in which sending a kill makes no sense:
    *  - the known instance is in a condition that won't respond to a kill (Mesos would merely re-send
    *    the current state), or
    *  - no task ids were supplied, i.e. there is nothing left to kill.
    *
    * In either situation an instance with reservations is left untouched ([[Noop]]), while any other
    * instance is removed from state ([[ExpungeFromState]]) so that it gets killed should Mesos report
    * it again later (that could be improved).
    *
    * Everything else results in [[IssueKillRequest]].
    *
    * @param instanceId    id of the instance to kill; only used for logging here
    * @param taskIds       ids of the tasks that would be killed; empty is treated as "all terminal"
    * @param knownInstance the instance as currently known, if any; supplies condition and reservation info
    * @return the action to perform for this kill request
    */
  def apply(instanceId: Instance.Id, taskIds: Iterable[Task.Id], knownInstance: Option[Instance]): KillAction = {
    val reserved = knownInstance.exists(_.hasReservation)

    // TODO(PODS): align this with other Terminal/Unreachable/whatever extractors
    // Defined only if the instance is known AND its condition won't respond to a kill.
    val unkillableCondition = knownInstance.map(_.state.condition).filter(wontRespondToKill)

    // Ephemeral instances are expunged once all their tasks are terminal, so they rarely show up here
    // without task ids. Resident tasks can, e.g. after a scale down or when a kill is attempted
    // between recovery.
    val nothingToKill = taskIds.isEmpty

    if (unkillableCondition.isEmpty && !nothingToKill) {
      val knownOrNot = if (knownInstance.isDefined) "known" else "unknown"
      logger.warn("Killing {} {} of instance {}", knownOrNot, taskIds.mkString(","), instanceId)
      IssueKillRequest
    } else {
      // An unkillable condition takes precedence over "no tasks" when explaining the decision.
      val reason = unkillableCondition match {
        case Some(condition) => s"it is ${condition.toString}"
        case None => "none of its tasks are running"
      }

      if (reserved) {
        logger.info(
          s"Ignoring kill request for ${instanceId}; killing it while ${reason} is unsupported")
        Noop
      } else {
        logger.warn(s"Expunging ${instanceId} from state because ${reason}")
        // we will eventually be notified of a taskStatusUpdate after the instance has been expunged
        ExpungeFromState
      }
    }
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 49 additions & 0 deletions
49
src/test/scala/mesosphere/marathon/core/task/termination/impl/KillActionTest.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package mesosphere.marathon | ||
package core.task.termination.impl | ||
|
||
import mesosphere.UnitTest | ||
import mesosphere.marathon.core.base.ConstantClock | ||
import mesosphere.marathon.core.instance.{ Instance, TestInstanceBuilder } | ||
import mesosphere.marathon.core.task.Task.LocalVolumeId | ||
import mesosphere.marathon.state.PathId | ||
import org.scalatest.prop.TableDrivenPropertyChecks | ||
|
||
/**
  * Checks which [[KillAction]] is computed for a kill request.
  *
  * The table covers known instances, reserved and ephemeral, both running and unreachable, each
  * passed together with its own task ids. The cases after the table cover the remaining inputs
  * `KillAction.apply` branches on: an instance that is not known at all, with and without task ids.
  */
class KillActionTest extends UnitTest with TableDrivenPropertyChecks {

  // NOTE(review): `clock` is not referenced by any case in this suite; kept as-is.
  val clock = ConstantClock()
  val appId = PathId("/test")

  lazy val localVolumeId = LocalVolumeId(appId, "unwanted-persistent-volume", "uuid1")

  // Resident (reserved) fixtures.
  lazy val residentLaunchedInstance: Instance =
    TestInstanceBuilder.newBuilder(appId)
      .addTaskResidentLaunched(localVolumeId)
      .getInstance()

  lazy val residentUnreachableInstance: Instance =
    TestInstanceBuilder.newBuilder(appId)
      .addTaskWithBuilder()
      .taskResidentUnreachable(localVolumeId)
      .build()
      .getInstance()

  // Ephemeral fixtures.
  lazy val unreachableInstance: Instance = TestInstanceBuilder.newBuilder(appId).addTaskUnreachable().getInstance()
  lazy val runningInstance: Instance = TestInstanceBuilder.newBuilder(appId).addTaskLaunched().getInstance()

  "computeKillAction" when {
    val knownInstanceCases = Table(
      ("name", "instance", "expected"),
      ("an unreachable reserved instance", residentUnreachableInstance, KillAction.Noop),
      ("a running reserved instance", residentLaunchedInstance, KillAction.IssueKillRequest),
      ("an unreachable ephemeral instance", unreachableInstance, KillAction.ExpungeFromState),
      ("a running ephemeral instance", runningInstance, KillAction.IssueKillRequest)
    )

    knownInstanceCases.foreach {
      case (name, instance, expected) =>
        s"killing ${name}" should {
          s"result in ${expected}" in {
            KillAction(instance.instanceId, instance.tasksMap.keys, Some(instance))
              .shouldBe(expected)
          }
        }
    }

    // Without a known instance there is neither a condition nor a reservation to look at,
    // so the outcome depends solely on whether any task ids were supplied.
    "killing an instance that is not known" should {
      "result in IssueKillRequest if task ids are given" in {
        KillAction(runningInstance.instanceId, runningInstance.tasksMap.keys, None)
          .shouldBe(KillAction.IssueKillRequest)
      }

      "result in ExpungeFromState if no task ids are given" in {
        KillAction(runningInstance.instanceId, Nil, None)
          .shouldBe(KillAction.ExpungeFromState)
      }
    }
  }
}