From 1f40cec8ec12c7f325a5dc785a26e83f2c5534e6 Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Tue, 2 Aug 2016 17:53:26 -0700 Subject: [PATCH] orchestrator: Avoid restarting a task that has already been restarted When I was debugging integration test failures a week ago, I dug deep into the recent restart supervisor changes, and noticed a possible race which is almost impossible in practice. This is the sequence of events that has to happen: 1) Restart task A -> new task B. 2) B fails before the restart delay has elapsed (pull failure): Restart(B) starts waiter goroutine. 3) B's node fails at exact moment after the restart delay finishes. The orchestrator calls Restart(B) again, this time there's no delay in progress so Restart proceeds to set B.DesiredState = SHUTDOWN and create a new task. 4) The waiter goroutine from step 2 is scheduled. It calls Restart(B) to resume the first restart attempt. This sets B.DesiredState = SHUTDOWN (which means no change) and creates a new task. We'd end up with an extra task. Again, this would be almost impossible to trigger, but I'm fixing it for the sake of correctness. The general principle here is that Restart should never been called on a task that already has DesiredState > RUNNING, since what Restart does is "shut down this task and replace it with a new one". I also added a sanity check to Restart. I'm not sure this is really helpful because returning an error probably does more harm than good in this case. But at least it would cause an error message to be logged. Signed-off-by: Aaron Lehmann --- manager/orchestrator/restart.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/manager/orchestrator/restart.go b/manager/orchestrator/restart.go index a727d91a77..bddc798c27 100644 --- a/manager/orchestrator/restart.go +++ b/manager/orchestrator/restart.go @@ -2,6 +2,7 @@ package orchestrator import ( "container/list" + "errors" "sync" "time" @@ -76,6 +77,9 @@ func (r *RestartSupervisor) waitRestart(ctx context.Context, oldDelay *delayedSt if t == nil { return nil } + if t.DesiredState > api.TaskStateRunning { + return nil + } service := store.GetService(tx, t.ServiceID) if service == nil { return nil @@ -108,6 +112,13 @@ func (r *RestartSupervisor) Restart(ctx context.Context, tx store.Tx, cluster *a } r.mu.Unlock() + // Sanity check: was the task shut down already by a separate call to + // Restart? If so, we must avoid restarting it, because this will create + // an extra task. This should never happen unless there is a bug. + if t.DesiredState > api.TaskStateRunning { + return errors.New("Restart called on task that was already shut down") + } + t.DesiredState = api.TaskStateShutdown err := store.UpdateTask(tx, &t) if err != nil {