Merge pull request #1205 from hashicorp/b-drain-restarting-jobs
Do not restart successful batch jobs when the node is tainted (removed/drained)
dadgar committed May 25, 2016
2 parents ecd0e1e + d5fa76e commit a113f31
Showing 5 changed files with 118 additions and 5 deletions.
32 changes: 32 additions & 0 deletions nomad/structs/structs.go
@@ -1904,6 +1904,21 @@ func (ts *TaskState) Failed() bool {
    }
}

// Successful returns whether a task finished successfully.
func (ts *TaskState) Successful() bool {
    l := len(ts.Events)
    if ts.State != TaskStateDead || l == 0 {
        return false
    }

    e := ts.Events[l-1]
    if e.Type != TaskTerminated {
        return false
    }

    return e.ExitCode == 0
}

const (
    // TaskDriveFailure indicates that the task could not be started due to a
    // failure in the driver.
@@ -2336,6 +2351,23 @@ func (a *Allocation) TerminalStatus() bool {
    }
}

// RanSuccessfully returns whether the client has run the allocation and all
// tasks finished successfully.
func (a *Allocation) RanSuccessfully() bool {
    // Handle the case where the client hasn't started the allocation.
    if len(a.TaskStates) == 0 {
        return false
    }

    // Check to see if all the tasks finished successfully in the allocation
    allSuccess := true
    for _, state := range a.TaskStates {
        allSuccess = allSuccess && state.Successful()
    }

    return allSuccess
}

// Stub returns a list stub for the allocation
func (a *Allocation) Stub() *AllocListStub {
    return &AllocListStub{
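As a rough usage sketch of the two new helpers (not part of this commit, and assuming the github.com/hashicorp/nomad/nomad/structs import path), an allocation whose only task terminated with exit code 0 reports success at both the task and allocation level:

package main

import (
    "fmt"

    "github.com/hashicorp/nomad/nomad/structs"
)

func main() {
    // An allocation with a single "web" task that exited cleanly.
    alloc := &structs.Allocation{
        TaskStates: map[string]*structs.TaskState{
            "web": {
                State: structs.TaskStateDead,
                Events: []*structs.TaskEvent{
                    {Type: structs.TaskTerminated, ExitCode: 0},
                },
            },
        },
    }

    fmt.Println(alloc.TaskStates["web"].Successful()) // true
    fmt.Println(alloc.RanSuccessfully())              // true
}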
8 changes: 4 additions & 4 deletions scheduler/generic_sched.go
@@ -235,12 +235,12 @@ func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []
     filter := func(a *structs.Allocation) bool {
         if s.batch {
             // Allocs from batch jobs should be filtered when the desired status
-            // is terminal or when the client status is failed so that they will
-            // be replaced. If they are complete but not failed, they shouldn't
-            // be replaced.
+            // is terminal and the client did not finish or when the client
+            // status is failed so that they will be replaced. If they are
+            // complete but not failed, they shouldn't be replaced.
             switch a.DesiredStatus {
             case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed:
-                return true
+                return !a.RanSuccessfully()
             default:
             }

68 changes: 68 additions & 0 deletions scheduler/generic_sched_test.go
@@ -1316,3 +1316,71 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) {

    h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

func TestBatchSched_ReRun_SuccessfullyFinishedAlloc(t *testing.T) {
    h := NewHarness(t)

    // Create two nodes, one that is drained and has a successfully finished
    // alloc and a fresh undrained one
    node := mock.Node()
    node.Drain = true
    node2 := mock.Node()
    noErr(t, h.State.UpsertNode(h.NextIndex(), node))
    noErr(t, h.State.UpsertNode(h.NextIndex(), node2))

    // Create a job
    job := mock.Job()
    job.Type = structs.JobTypeBatch
    job.TaskGroups[0].Count = 1
    noErr(t, h.State.UpsertJob(h.NextIndex(), job))

    // Create a successful alloc
    alloc := mock.Alloc()
    alloc.Job = job
    alloc.JobID = job.ID
    alloc.NodeID = node.ID
    alloc.Name = "my-job.web[0]"
    alloc.ClientStatus = structs.AllocClientStatusComplete
    alloc.TaskStates = map[string]*structs.TaskState{
        "web": &structs.TaskState{
            State: structs.TaskStateDead,
            Events: []*structs.TaskEvent{
                {
                    Type:     structs.TaskTerminated,
                    ExitCode: 0,
                },
            },
        },
    }
    noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))

    // Create a mock evaluation to rerun the job
    eval := &structs.Evaluation{
        ID:          structs.GenerateUUID(),
        Priority:    job.Priority,
        TriggeredBy: structs.EvalTriggerJobRegister,
        JobID:       job.ID,
    }

    // Process the evaluation
    err := h.Process(NewBatchScheduler, eval)
    if err != nil {
        t.Fatalf("err: %v", err)
    }

    // Ensure no plan
    if len(h.Plans) != 0 {
        t.Fatalf("bad: %#v", h.Plans)
    }

    // Lookup the allocations by JobID
    out, err := h.State.AllocsByJob(job.ID)
    noErr(t, err)

    // Ensure no replacement alloc was placed.
    if len(out) != 1 {
        t.Fatalf("bad: %#v", out)
    }

    h.AssertEvalStatus(t, structs.EvalStatusComplete)
}
12 changes: 11 additions & 1 deletion scheduler/util.go
@@ -81,8 +81,17 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
             continue
         }

-        // If we are on a tainted node, we must migrate
+        // If we are on a tainted node, we must migrate if we are a service or
+        // if the batch allocation did not finish
         if taintedNodes[exist.NodeID] {
+            // If the job is batch and finished successfully, the fact that the
+            // node is tainted does not mean it should be migrated, as the work
+            // was already successfully finished. However, for service/system
+            // jobs, tasks should never complete. The check of batch type
+            // defends against client bugs.
+            if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
+                goto IGNORE
+            }
             result.migrate = append(result.migrate, allocTuple{
                 Name:      name,
                 TaskGroup: tg,
@@ -102,6 +111,7 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
         }

         // Everything is up-to-date
+    IGNORE:
         result.ignore = append(result.ignore, allocTuple{
             Name:      name,
             TaskGroup: tg,
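Restated outside the scheduler, the new tainted-node rule amounts to a small predicate: a drained or removed node forces a migration unless the allocation belongs to a batch job whose tasks already finished successfully. The shouldMigrate helper below is a hypothetical sketch of that decision, not part of this change, and again assumes the nomad/structs import path:

package main

import (
    "fmt"

    "github.com/hashicorp/nomad/nomad/structs"
)

// shouldMigrate is a hypothetical restatement of the branch added to
// diffAllocs above: migrate off a tainted node unless the batch work is done.
func shouldMigrate(alloc *structs.Allocation) bool {
    if alloc.Job.Type == structs.JobTypeBatch && alloc.RanSuccessfully() {
        return false // completed batch work is ignored, not rescheduled
    }
    return true
}

func main() {
    // A batch allocation whose only task exited cleanly.
    done := &structs.Allocation{
        Job: &structs.Job{Type: structs.JobTypeBatch},
        TaskStates: map[string]*structs.TaskState{
            "web": {
                State: structs.TaskStateDead,
                Events: []*structs.TaskEvent{
                    {Type: structs.TaskTerminated, ExitCode: 0},
                },
            },
        },
    }
    fmt.Println(shouldMigrate(done)) // false: the work already completed
}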
3 changes: 3 additions & 0 deletions scheduler/util_test.go
@@ -74,13 +74,15 @@ func TestDiffAllocs(t *testing.T) {
             ID:     structs.GenerateUUID(),
             NodeID: "zip",
             Name:   "my-job.web[10]",
+            Job:    oldJob,
         },

         // Migrate the 3rd
         &structs.Allocation{
             ID:     structs.GenerateUUID(),
             NodeID: "dead",
             Name:   "my-job.web[2]",
+            Job:    oldJob,
         },
     }

@@ -155,6 +157,7 @@ func TestDiffSystemAllocs(t *testing.T) {
             ID:     structs.GenerateUUID(),
             NodeID: "dead",
             Name:   "my-job.web[0]",
+            Job:    oldJob,
         },
     }

