diff --git a/.changelog/11945.txt b/.changelog/11945.txt new file mode 100644 index 00000000000..71882cff0f2 --- /dev/null +++ b/.changelog/11945.txt @@ -0,0 +1,3 @@ +```release-note:bug +Fixed a bug where successful poststart tasks were marked as unhealthy +``` diff --git a/client/allochealth/tracker.go b/client/allochealth/tracker.go index cc8f79a225e..7c17b7e364d 100644 --- a/client/allochealth/tracker.go +++ b/client/allochealth/tracker.go @@ -284,6 +284,12 @@ func (t *Tracker) watchTaskEvents() { continue } + // If this is a poststart task which has already succeeded, we + // should skip evaluation. + if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() { + continue + } + // One of the tasks has failed so we can exit watching if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) { t.setTaskHealth(false, true) diff --git a/client/allochealth/tracker_test.go b/client/allochealth/tracker_test.go index 9276d77265b..f4aec166d9e 100644 --- a/client/allochealth/tracker_test.go +++ b/client/allochealth/tracker_test.go @@ -129,6 +129,47 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) { } } +func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) { + t.Parallel() + + alloc := mock.LifecycleAllocWithPoststartDeploy() + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1 + // Synthesize running alloc and tasks + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.TaskStates = map[string]*structs.TaskState{ + "web": { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + "post": { + State: structs.TaskStateDead, + StartedAt: time.Now(), + FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2), + }, + } + + logger := testlog.HCLogger(t) + b := cstructs.NewAllocBroadcaster(logger) + defer b.Close() + + consul := consul.NewMockConsulServiceClient(t, logger) + ctx, cancelFn := 
context.WithCancel(context.Background()) + defer cancelFn() + + checkInterval := 10 * time.Millisecond + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true) + tracker.checkLookupInterval = checkInterval + tracker.Start() + + select { + case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2): + require.Fail(t, "timed out while waiting for health") + case h := <-tracker.HealthyCh(): + require.True(t, h) + } +} + func TestTracker_Checks_Unhealthy(t *testing.T) { t.Parallel() diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 95886654624..343545ef87e 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -683,6 +683,110 @@ func LifecycleJobWithPoststopDeploy() *structs.Job { return job } +func LifecycleJobWithPoststartDeploy() *structs.Job { + job := &structs.Job{ + Region: "global", + ID: fmt.Sprintf("mock-service-%s", uuid.Generate()), + Name: "my-job", + Namespace: structs.DefaultNamespace, + Type: structs.JobTypeBatch, + Priority: 50, + AllAtOnce: false, + Datacenters: []string{"dc1"}, + Constraints: []*structs.Constraint{ + { + LTarget: "${attr.kernel.name}", + RTarget: "linux", + Operand: "=", + }, + }, + TaskGroups: []*structs.TaskGroup{ + { + Name: "web", + Count: 1, + Migrate: structs.DefaultMigrateStrategy(), + RestartPolicy: &structs.RestartPolicy{ + Attempts: 0, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + Mode: structs.RestartPolicyModeFail, + }, + Tasks: []*structs.Task{ + { + Name: "web", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + { + Name: "side", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + Lifecycle: &structs.TaskLifecycleConfig{ + Hook: structs.TaskLifecycleHookPrestart, + Sidecar: true, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: 
&structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + { + Name: "post", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + Lifecycle: &structs.TaskLifecycleConfig{ + Hook: structs.TaskLifecycleHookPoststart, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + { + Name: "init", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + Lifecycle: &structs.TaskLifecycleConfig{ + Hook: structs.TaskLifecycleHookPrestart, + Sidecar: false, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + }, + }, + }, + Meta: map[string]string{ + "owner": "armon", + }, + Status: structs.JobStatusPending, + Version: 0, + CreateIndex: 42, + ModifyIndex: 99, + JobModifyIndex: 99, + } + job.Canonicalize() + return job +} + func LifecycleAllocWithPoststopDeploy() *structs.Allocation { alloc := &structs.Allocation{ ID: uuid.Generate(), @@ -759,6 +863,82 @@ func LifecycleAllocWithPoststopDeploy() *structs.Allocation { return alloc } +func LifecycleAllocWithPoststartDeploy() *structs.Allocation { + alloc := &structs.Allocation{ + ID: uuid.Generate(), + EvalID: uuid.Generate(), + NodeID: "12345678-abcd-efab-cdef-123456789xyz", + Namespace: structs.DefaultNamespace, + TaskGroup: "web", + + // TODO Remove once clientv2 gets merged + Resources: &structs.Resources{ + CPU: 500, + MemoryMB: 256, + }, + TaskResources: map[string]*structs.Resources{ + "web": { + CPU: 1000, + MemoryMB: 256, + }, + "init": { + CPU: 1000, + MemoryMB: 256, + }, + "side": { + CPU: 1000, + MemoryMB: 256, + }, + "post": { + CPU: 1000, + MemoryMB: 256, + }, + }, + + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + "init": { + 
Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + "side": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + "post": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + }, + }, + Job: LifecycleJobWithPoststartDeploy(), + DesiredStatus: structs.AllocDesiredStatusRun, + ClientStatus: structs.AllocClientStatusPending, + } + alloc.JobID = alloc.Job.ID + return alloc +} + func MaxParallelJob() *structs.Job { update := *structs.DefaultUpdateStrategy update.MaxParallel = 0 diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 2c20932a24f..ac061ab0997 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -7769,9 +7769,9 @@ func (ts *TaskState) Copy() *TaskState { return newTS } -// Successful returns whether a task finished successfully. This doesn't really -// have meaning on a non-batch allocation because a service and system -// allocation should not finish. +// Successful returns whether a task finished successfully. Only meaningful +// for batch allocations or ephemeral (non-sidecar) lifecycle tasks that are +// part of a service or system allocation. func (ts *TaskState) Successful() bool { return ts.State == TaskStateDead && !ts.Failed }