
Commit 0970dd2

Merge pull request #11878 from kainoaseto/fix/multi-task-group-canary-deploys

Bugfix: auto-promote canary taskgroups when mixed with non-canary taskgroups
schmichael authored Feb 1, 2022
2 parents 70e5b4e + 09cc586 commit 0970dd2
Showing 4 changed files with 146 additions and 22 deletions.
3 changes: 3 additions & 0 deletions .changelog/11878.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+core: Fixed auto-promotion of canaries in jobs with at least one task group without canaries.
+```
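To make the release note concrete, the following is a minimal standalone sketch of the pre-fix promotion check, using simplified stand-in types rather than Nomad's actual structs (in Nomad, PlacedCanaries on structs.DeploymentState is a slice of alloc IDs, not a count; groupState and canAutoPromoteOld are hypothetical names for illustration). A task group doing rolling updates never sets auto_promote, so a single such group made the check fail on every evaluation and the deployment's canaries were never promoted:

```go
package main

import "fmt"

// groupState is a simplified stand-in for structs.DeploymentState;
// PlacedCanaries is a count here rather than a slice of alloc IDs.
type groupState struct {
	AutoPromote     bool
	DesiredCanaries int
	PlacedCanaries  int
}

// canAutoPromoteOld mirrors the pre-fix loop in autoPromoteDeployment:
// every task group must have auto_promote set and all of its canaries
// placed, even if the group places no canaries at all.
func canAutoPromoteOld(groups map[string]groupState) bool {
	for _, g := range groups {
		if !g.AutoPromote || g.DesiredCanaries != g.PlacedCanaries {
			return false
		}
	}
	return true
}

func main() {
	groups := map[string]groupState{
		"web": {AutoPromote: true, DesiredCanaries: 2, PlacedCanaries: 2}, // canary group, healthy
		"api": {AutoPromote: false},                                       // rolling group, no canaries
	}
	// Prints false: the rolling "api" group vetoes promotion forever,
	// even though every canary that exists is placed and healthy.
	fmt.Println(canAutoPromoteOld(groups))
}
```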
9 changes: 8 additions & 1 deletion nomad/deploymentwatcher/deployment_watcher.go
@@ -283,9 +283,16 @@ func (w *deploymentWatcher) autoPromoteDeployment(allocs []*structs.AllocListStub
return nil
}

-// AutoPromote iff every task group is marked auto_promote and is healthy. The whole
+// AutoPromote iff every task group with canaries is marked auto_promote and is healthy. The whole
// job version has been incremented, so we promote together. See also AutoRevert
for _, dstate := range d.TaskGroups {
+
+// skip auto promote canary validation if the task group has no canaries
+// to prevent auto promote hanging on mixed canary/non-canary taskgroup deploys
+if dstate.DesiredCanaries < 1 {
+continue
+}
+
if !dstate.AutoPromote || dstate.DesiredCanaries != len(dstate.PlacedCanaries) {
return nil
}
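Continuing the simplified sketch from the changelog section above (the same hypothetical groupState stand-in, an illustration rather than Nomad's real code), the guard added in this hunk skips task groups that desire no canaries, so only genuine canary groups gate auto-promotion:

```go
// canAutoPromoteFixed mirrors the fixed loop: a task group that places
// no canaries is skipped instead of vetoing promotion, which is what
// previously made mixed canary/non-canary deployments hang.
func canAutoPromoteFixed(groups map[string]groupState) bool {
	for _, g := range groups {
		if g.DesiredCanaries < 1 {
			continue // no canaries to validate for this group
		}
		if !g.AutoPromote || g.DesiredCanaries != g.PlacedCanaries {
			return false
		}
	}
	return true
}
```

With the same mixed "web"/"api" groups as in the earlier sketch, this version returns true once both "web" canaries are placed and healthy, which is exactly the scenario the new test below exercises.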
74 changes: 53 additions & 21 deletions nomad/deploymentwatcher/deployments_watcher_test.go
@@ -535,30 +535,40 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
w, m := defaultTestDeploymentWatcher(t)
now := time.Now()

-// Create 1 UpdateStrategy, 1 job (1 TaskGroup), 2 canaries, and 1 deployment
-upd := structs.DefaultUpdateStrategy.Copy()
-upd.AutoPromote = true
-upd.MaxParallel = 2
-upd.Canary = 2
-upd.ProgressDeadline = 5 * time.Second
+// Create 1 UpdateStrategy, 1 job (2 TaskGroups), 2 canaries, and 1 deployment
+canaryUpd := structs.DefaultUpdateStrategy.Copy()
+canaryUpd.AutoPromote = true
+canaryUpd.MaxParallel = 2
+canaryUpd.Canary = 2
+canaryUpd.ProgressDeadline = 5 * time.Second

-j := mock.Job()
-j.TaskGroups[0].Update = upd
+rollingUpd := structs.DefaultUpdateStrategy.Copy()
+rollingUpd.ProgressDeadline = 5 * time.Second
+
+j := mock.MultiTaskGroupJob()
+j.TaskGroups[0].Update = canaryUpd
+j.TaskGroups[1].Update = rollingUpd

d := mock.Deployment()
d.JobID = j.ID
// This is created in scheduler.computeGroup at runtime, where properties from the
// UpdateStrategy are copied in
d.TaskGroups = map[string]*structs.DeploymentState{
"web": {
-AutoPromote: upd.AutoPromote,
-AutoRevert: upd.AutoRevert,
-ProgressDeadline: upd.ProgressDeadline,
+AutoPromote: canaryUpd.AutoPromote,
+AutoRevert: canaryUpd.AutoRevert,
+ProgressDeadline: canaryUpd.ProgressDeadline,
DesiredTotal: 2,
},
"api": {
AutoPromote: rollingUpd.AutoPromote,
AutoRevert: rollingUpd.AutoRevert,
ProgressDeadline: rollingUpd.ProgressDeadline,
DesiredTotal: 2,
},
}

-alloc := func() *structs.Allocation {
+canaryAlloc := func() *structs.Allocation {
a := mock.Alloc()
a.DeploymentID = d.ID
a.CreateTime = now.UnixNano()
@@ -569,14 +579,36 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
return a
}

-a := alloc()
-b := alloc()
+rollingAlloc := func() *structs.Allocation {
+a := mock.Alloc()
+a.DeploymentID = d.ID
+a.CreateTime = now.UnixNano()
+a.ModifyTime = now.UnixNano()
+a.TaskGroup = "api"
+a.AllocatedResources.Tasks["api"] = a.AllocatedResources.Tasks["web"].Copy()
+delete(a.AllocatedResources.Tasks, "web")
+a.TaskResources["api"] = a.TaskResources["web"].Copy()
+delete(a.TaskResources, "web")
+a.DeploymentStatus = &structs.AllocDeploymentStatus{
+Canary: false,
+}
+return a
+}

-d.TaskGroups[a.TaskGroup].PlacedCanaries = []string{a.ID, b.ID}
-d.TaskGroups[a.TaskGroup].DesiredCanaries = 2
+// Web taskgroup (0)
+ca1 := canaryAlloc()
+ca2 := canaryAlloc()
+
+// Api taskgroup (1)
+ra1 := rollingAlloc()
+ra2 := rollingAlloc()
+
+d.TaskGroups[ca1.TaskGroup].PlacedCanaries = []string{ca1.ID, ca2.ID}
+d.TaskGroups[ca1.TaskGroup].DesiredCanaries = 2
+d.TaskGroups[ra1.TaskGroup].PlacedAllocs = 2
require.NoError(t, m.state.UpsertJob(structs.MsgTypeTestSetup, m.nextIndex(), j), "UpsertJob")
require.NoError(t, m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
-require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a, b}), "UpsertAllocs")
+require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{ca1, ca2, ra1, ra2}), "UpsertAllocs")

// =============================================================
// Support method calls
Expand All @@ -595,7 +627,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {

matchConfig1 := &matchDeploymentAllocHealthRequestConfig{
DeploymentID: d.ID,
-Healthy: []string{a.ID, b.ID},
+Healthy: []string{ca1.ID, ca2.ID, ra1.ID, ra2.ID},
Eval: true,
}
matcher1 := matchDeploymentAllocHealthRequest(matchConfig1)
@@ -629,7 +661,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
// Mark the canaries healthy
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
-HealthyAllocationIDs: []string{a.ID, b.ID},
+HealthyAllocationIDs: []string{ca1.ID, ca2.ID, ra1.ID, ra2.ID},
}
var resp structs.DeploymentUpdateResponse
// Calls w.raft.UpdateDeploymentAllocHealth, which is implemented by StateStore in
@@ -654,12 +686,12 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
require.Equal(t, "running", d.Status)
require.True(t, d.TaskGroups["web"].Promoted)

-a1, _ := m.state.AllocByID(ws, a.ID)
+a1, _ := m.state.AllocByID(ws, ca1.ID)
require.False(t, a1.DeploymentStatus.Canary)
require.Equal(t, "pending", a1.ClientStatus)
require.Equal(t, "run", a1.DesiredStatus)

-b1, _ := m.state.AllocByID(ws, b.ID)
+b1, _ := m.state.AllocByID(ws, ca2.ID)
require.False(t, b1.DeploymentStatus.Canary)
}

82 changes: 82 additions & 0 deletions nomad/mock/mock.go
@@ -337,6 +337,88 @@ func Job() *structs.Job {
return job
}

+func MultiTaskGroupJob() *structs.Job {
+job := Job()
+apiTaskGroup := &structs.TaskGroup{
+Name: "api",
+Count: 10,
+EphemeralDisk: &structs.EphemeralDisk{
+SizeMB: 150,
+},
+RestartPolicy: &structs.RestartPolicy{
+Attempts: 3,
+Interval: 10 * time.Minute,
+Delay: 1 * time.Minute,
+Mode: structs.RestartPolicyModeDelay,
+},
+ReschedulePolicy: &structs.ReschedulePolicy{
+Attempts: 2,
+Interval: 10 * time.Minute,
+Delay: 5 * time.Second,
+DelayFunction: "constant",
+},
+Migrate: structs.DefaultMigrateStrategy(),
+Networks: []*structs.NetworkResource{
+{
+Mode: "host",
+DynamicPorts: []structs.Port{
+{Label: "http"},
+{Label: "admin"},
+},
+},
+},
+Tasks: []*structs.Task{
+{
+Name: "api",
+Driver: "exec",
+Config: map[string]interface{}{
+"command": "/bin/date",
+},
+Env: map[string]string{
+"FOO": "bar",
+},
+Services: []*structs.Service{
+{
+Name: "${TASK}-backend",
+PortLabel: "http",
+Tags: []string{"pci:${meta.pci-dss}", "datacenter:${node.datacenter}"},
+Checks: []*structs.ServiceCheck{
+{
+Name: "check-table",
+Type: structs.ServiceCheckScript,
+Command: "/usr/local/check-table-${meta.database}",
+Args: []string{"${meta.version}"},
+Interval: 30 * time.Second,
+Timeout: 5 * time.Second,
+},
+},
+},
+{
+Name: "${TASK}-admin",
+PortLabel: "admin",
+},
+},
+LogConfig: structs.DefaultLogConfig(),
+Resources: &structs.Resources{
+CPU: 500,
+MemoryMB: 256,
+},
+Meta: map[string]string{
+"foo": "bar",
+},
+},
+},
+Meta: map[string]string{
+"elb_check_type": "http",
+"elb_check_interval": "30s",
+"elb_check_min": "3",
+},
+}
+job.TaskGroups = append(job.TaskGroups, apiTaskGroup)
+job.Canonicalize()
+return job
+}
+
func LifecycleSideTask(resources structs.Resources, i int) *structs.Task {
return &structs.Task{
Name: fmt.Sprintf("side-%d", i),
