diff --git a/CHANGELOG.md b/CHANGELOG.md index 95464c9cca0..fa9fe2c3fab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ IMPROVEMENTS: * core: Add node name to output of `nomad node status` command in verbose mode [[GH-5224](https://github.com/hashicorp/nomad/pull/5224)] * core: Reduce the size of the raft transaction for plans by only sending fields updated by the plan applier [[GH-5602](https://github.com/hashicorp/nomad/pull/5602)] * api: Support configuring `http.Client` used by golang `api` package [[GH-5275](https://github.com/hashicorp/nomad/pull/5275)] + * core: Add job update `auto_promote` flag, which causes deployments to promote themselves when all canaries become healthy [[GH-5719](https://github.com/hashicorp/nomad/pull/5719)] * api: Add preemption related fields to API results that return an allocation list. [[GH-5580](https://github.com/hashicorp/nomad/pull/5580)] * api: Add additional config options to scheduler configuration endpoint to disable preemption [[GH-5628](https://github.com/hashicorp/nomad/issues/5628)] * client: Reduce unnecessary lost nodes on server failure [[GH-5654](https://github.com/hashicorp/nomad/issues/5654)] diff --git a/api/jobs.go b/api/jobs.go index 973250f3034..4f73da10c6e 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -375,13 +375,15 @@ type UpdateStrategy struct { MinHealthyTime *time.Duration `mapstructure:"min_healthy_time"` HealthyDeadline *time.Duration `mapstructure:"healthy_deadline"` ProgressDeadline *time.Duration `mapstructure:"progress_deadline"` - AutoRevert *bool `mapstructure:"auto_revert"` Canary *int `mapstructure:"canary"` + AutoRevert *bool `mapstructure:"auto_revert"` + AutoPromote *bool `mapstructure:"auto_promote"` } // DefaultUpdateStrategy provides a baseline that can be used to upgrade // jobs with the old policy or for populating field defaults. 
func DefaultUpdateStrategy() *UpdateStrategy { + // boolPtr fields are omitted to avoid masking an unconfigured nil return &UpdateStrategy{ Stagger: timeToPtr(30 * time.Second), MaxParallel: intToPtr(1), @@ -433,6 +435,10 @@ func (u *UpdateStrategy) Copy() *UpdateStrategy { copy.Canary = intToPtr(*u.Canary) } + if u.AutoPromote != nil { + copy.AutoPromote = boolToPtr(*u.AutoPromote) + } + return copy } @@ -472,11 +478,17 @@ func (u *UpdateStrategy) Merge(o *UpdateStrategy) { if o.Canary != nil { u.Canary = intToPtr(*o.Canary) } + + if o.AutoPromote != nil { + u.AutoPromote = boolToPtr(*o.AutoPromote) + } } func (u *UpdateStrategy) Canonicalize() { d := DefaultUpdateStrategy() + // boolPtr fields are omitted to avoid masking an unconfigured nil + if u.MaxParallel == nil { u.MaxParallel = d.MaxParallel } diff --git a/api/jobs_test.go b/api/jobs_test.go index e131d49dc37..d55351214f0 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -323,6 +323,7 @@ func TestJobs_Canonicalize(t *testing.T) { ProgressDeadline: timeToPtr(10 * time.Minute), AutoRevert: boolToPtr(false), Canary: intToPtr(0), + AutoPromote: nil, }, TaskGroups: []*TaskGroup{ { @@ -357,6 +358,7 @@ func TestJobs_Canonicalize(t *testing.T) { ProgressDeadline: timeToPtr(10 * time.Minute), AutoRevert: boolToPtr(false), Canary: intToPtr(0), + AutoPromote: nil, }, Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ @@ -486,6 +488,7 @@ func TestJobs_Canonicalize(t *testing.T) { ProgressDeadline: timeToPtr(7 * time.Minute), AutoRevert: boolToPtr(false), Canary: intToPtr(0), + AutoPromote: boolToPtr(false), }, TaskGroups: []*TaskGroup{ { @@ -497,6 +500,7 @@ func TestJobs_Canonicalize(t *testing.T) { MinHealthyTime: timeToPtr(1 * time.Second), AutoRevert: boolToPtr(true), Canary: intToPtr(1), + AutoPromote: boolToPtr(true), }, Tasks: []*Task{ { @@ -541,6 +545,7 @@ func TestJobs_Canonicalize(t *testing.T) { ProgressDeadline: timeToPtr(7 * time.Minute), AutoRevert: boolToPtr(false), Canary: intToPtr(0), + AutoPromote: boolToPtr(false), }, TaskGroups: []*TaskGroup{ { @@ -574,6 +579,7 @@ func TestJobs_Canonicalize(t *testing.T) { ProgressDeadline: timeToPtr(7 * time.Minute), AutoRevert: boolToPtr(true), Canary: intToPtr(1), + AutoPromote: boolToPtr(true), }, Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ @@ -616,6 +622,7 @@ func TestJobs_Canonicalize(t *testing.T) { ProgressDeadline: timeToPtr(7 * time.Minute), AutoRevert: boolToPtr(false), Canary: intToPtr(0), + AutoPromote: boolToPtr(false), }, Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ diff --git a/api/tasks.go b/api/tasks.go index 747711baa3e..6932876734d 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -504,6 +504,7 @@ func NewTaskGroup(name string, count int) *TaskGroup { } } +// Canonicalize sets defaults and merges settings that should be inherited from the job func (g *TaskGroup) Canonicalize(job *Job) { if g.Name == nil { g.Name = stringToPtr("") diff --git a/api/tasks_test.go b/api/tasks_test.go index 8ebea05859d..3ee01607371 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -374,6 +374,7 @@ func TestTaskGroup_Canonicalize_Update(t *testing.T) { ID: stringToPtr("test"), Update: &UpdateStrategy{ AutoRevert: boolToPtr(false), + AutoPromote: boolToPtr(false), Canary: intToPtr(0), HealthCheck: stringToPtr(""), HealthyDeadline: timeToPtr(0), diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index c2944023434..d17bb91fdee 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -611,7 +611,9 @@ func 
ApiJobToStructJob(job *api.Job) *structs.Job { Affinities: ApiAffinitiesToStructs(job.Affinities), } - // COMPAT: Remove in 0.7.0. Update has been pushed into the task groups + // Update has been pushed into the task groups. stagger and max_parallel are + // preserved at the job level, but all other values are discarded. The job.Update + // api value is merged into TaskGroups already in api.Canonicalize if job.Update != nil { j.Update = structs.UpdateStrategy{} @@ -718,9 +720,17 @@ func ApiTgToStructsTG(taskGroup *api.TaskGroup, tg *structs.TaskGroup) { MinHealthyTime: *taskGroup.Update.MinHealthyTime, HealthyDeadline: *taskGroup.Update.HealthyDeadline, ProgressDeadline: *taskGroup.Update.ProgressDeadline, - AutoRevert: *taskGroup.Update.AutoRevert, Canary: *taskGroup.Update.Canary, } + + // boolPtr fields may be nil, others will have pointers to default values via Canonicalize + if taskGroup.Update.AutoRevert != nil { + tg.Update.AutoRevert = *taskGroup.Update.AutoRevert + } + + if taskGroup.Update.AutoPromote != nil { + tg.Update.AutoPromote = *taskGroup.Update.AutoPromote + } } if l := len(taskGroup.Tasks); l != 0 { diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index a076a9b9d82..77329a127bb 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -1624,6 +1624,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { HealthyDeadline: 5 * time.Minute, ProgressDeadline: 5 * time.Minute, AutoRevert: true, + AutoPromote: false, Canary: 1, }, Meta: map[string]string{ @@ -2039,6 +2040,79 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { } } +func TestJobs_ApiJobToStructsJobUpdate(t *testing.T) { + apiJob := &api.Job{ + Update: &api.UpdateStrategy{ + Stagger: helper.TimeToPtr(1 * time.Second), + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr(structs.UpdateStrategyHealthCheck_Manual), + MinHealthyTime: helper.TimeToPtr(1 * time.Minute), + HealthyDeadline: helper.TimeToPtr(3 * time.Minute), + ProgressDeadline: helper.TimeToPtr(3 * time.Minute), + AutoRevert: helper.BoolToPtr(false), + AutoPromote: nil, + Canary: helper.IntToPtr(1), + }, + TaskGroups: []*api.TaskGroup{ + { + Update: &api.UpdateStrategy{ + Canary: helper.IntToPtr(2), + AutoRevert: helper.BoolToPtr(true), + }, + }, { + Update: &api.UpdateStrategy{ + Canary: helper.IntToPtr(3), + AutoPromote: helper.BoolToPtr(true), + }, + }, + }, + } + + structsJob := ApiJobToStructJob(apiJob) + + // Update has been moved from job down to the groups + jobUpdate := structs.UpdateStrategy{ + Stagger: 1000000000, + MaxParallel: 5, + HealthCheck: "", + MinHealthyTime: 0, + HealthyDeadline: 0, + ProgressDeadline: 0, + AutoRevert: false, + AutoPromote: false, + Canary: 0, + } + + // But the groups inherit settings from the job update + group1 := structs.UpdateStrategy{ + Stagger: 1000000000, + MaxParallel: 5, + HealthCheck: "manual", + MinHealthyTime: 60000000000, + HealthyDeadline: 180000000000, + ProgressDeadline: 180000000000, + AutoRevert: true, + AutoPromote: false, + Canary: 2, + } + + group2 := structs.UpdateStrategy{ + Stagger: 1000000000, + MaxParallel: 5, + HealthCheck: "manual", + MinHealthyTime: 60000000000, + HealthyDeadline: 180000000000, + ProgressDeadline: 180000000000, + AutoRevert: false, + AutoPromote: true, + Canary: 3, + } + + require.Equal(t, jobUpdate, structsJob.Update) + require.Equal(t, group1, *structsJob.TaskGroups[0].Update) + require.Equal(t, group2, *structsJob.TaskGroups[1].Update) +} + // TestHTTP_JobValidate_SystemMigrate asserts 
that a system job with a migrate // stanza fails to validate but does not panic (see #5477). func TestHTTP_JobValidate_SystemMigrate(t *testing.T) { diff --git a/e2e/README.md b/e2e/README.md index e6438dc7e9d..59c1a0dcbc5 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -15,8 +15,8 @@ You'll need AWS credentials (`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`) to Running =========== -After completing the provisioning step above, you should see CLI output showing the IP addresses of Nomad client machines. To run the tests, set the NOMAD_ADDR variable to one of the client IPs. +After completing the provisioning step above, you should see CLI output showing the IP addresses of Nomad client machines. To run the tests, set the NOMAD_ADDR variable to `http://[client IP]:4646/` ``` -$ NOMAD_ADDR=<> $NOMAD_E2E=1 go test -v +$ NOMAD_ADDR=<> NOMAD_E2E=1 go test -v ``` diff --git a/e2e/deployment/deployment.go b/e2e/deployment/deployment.go new file mode 100644 index 00000000000..459bb5e0916 --- /dev/null +++ b/e2e/deployment/deployment.go @@ -0,0 +1,79 @@ +package deployment + +import ( + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/framework" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" +) + +type DeploymentTest struct { + framework.TC + jobIds []string +} + +func init() { + framework.AddSuites(&framework.TestSuite{ + Component: "Deployment", + CanRunLocal: true, + Cases: []framework.TestCase{ + new(DeploymentTest), + }, + }) +} + +func (tc *DeploymentTest) BeforeAll(f *framework.F) { + // Ensure cluster has leader before running tests + e2eutil.WaitForLeader(f.T(), tc.Nomad()) + e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 4) +} + +func (tc *DeploymentTest) TestDeploymentAutoPromote(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + uuid := uuid.Generate() + jobId := "deployment" + uuid[0:8] + tc.jobIds = append(tc.jobIds, jobId) + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "deployment/input/deployment_auto0.nomad", jobId) + + // Upgrade + e2eutil.RegisterAllocs(t, nomadClient, "deployment/input/deployment_auto1.nomad", jobId) + var deploy *api.Deployment + ds, _, err := nomadClient.Deployments().List(nil) + require.NoError(t, err) + + // Find the deployment + for _, d := range ds { + if d.JobID == jobId { + deploy = d + break + } + } + + // Deployment is auto pending the upgrade of "two" which has a longer time to health + run := structs.DeploymentStatusRunning + require.Equal(t, run, deploy.Status) + require.Equal(t, structs.DeploymentStatusDescriptionRunningAutoPromotion, deploy.StatusDescription) + + // Deployment is eventually running + e2eutil.WaitForDeployment(t, nomadClient, deploy.ID, run, structs.DeploymentStatusDescriptionRunning) + + deploy, _, _ = nomadClient.Deployments().Info(deploy.ID, nil) + require.Equal(t, run, deploy.Status) + require.Equal(t, structs.DeploymentStatusDescriptionRunning, deploy.StatusDescription) +} + +func (tc *DeploymentTest) AfterEach(f *framework.F) { + nomadClient := tc.Nomad() + jobs := nomadClient.Jobs() + // Stop all jobs in test + for _, id := range tc.jobIds { + jobs.Deregister(id, true, nil) + } + tc.jobIds = []string{} + // Garbage collect + nomadClient.System().GarbageCollect() +} diff --git a/e2e/deployment/input/deployment_auto0.nomad b/e2e/deployment/input/deployment_auto0.nomad new file mode 100644 index 00000000000..454a818b9df --- /dev/null +++ 
b/e2e/deployment/input/deployment_auto0.nomad @@ -0,0 +1,54 @@ +job "deployment_auto.nomad" { + datacenters = ["dc1"] + + group "one" { + count = 3 + + update { + max_parallel = 3 + auto_promote = true + canary = 2 + } + + task "one" { + driver = "raw_exec" + + config { + command = "/bin/sleep" + # change args to update the job, the only changes + args = ["1000000"] + } + + resources { + cpu = 20 + memory = 20 + } + } + } + + group "two" { + count = 3 + + update { + max_parallel = 2 + auto_promote = true + canary = 2 + min_healthy_time = "2s" + } + + task "two" { + driver = "raw_exec" + + config { + command = "/bin/sleep" + # change args to update the job, the only changes + args = ["2000000"] + } + + resources { + cpu = 20 + memory = 20 + } + } + } +} diff --git a/e2e/deployment/input/deployment_auto1.nomad b/e2e/deployment/input/deployment_auto1.nomad new file mode 100644 index 00000000000..415e26e3237 --- /dev/null +++ b/e2e/deployment/input/deployment_auto1.nomad @@ -0,0 +1,54 @@ +job "deployment_auto.nomad" { + datacenters = ["dc1"] + + group "one" { + count = 3 + + update { + max_parallel = 3 + auto_promote = true + canary = 2 + } + + task "one" { + driver = "raw_exec" + + config { + command = "/bin/sleep" + # change args to update the job, the only changes + args = ["1000001"] + } + + resources { + cpu = 20 + memory = 20 + } + } + } + + group "two" { + count = 3 + + update { + max_parallel = 2 + auto_promote = true + canary = 2 + min_healthy_time = "2s" + } + + task "two" { + driver = "raw_exec" + + config { + command = "/bin/sleep" + # change args to update the job, the only changes + args = ["2000001"] + } + + resources { + cpu = 20 + memory = 20 + } + } + } +} diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 26483234080..10fde5f5a66 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -8,6 +8,7 @@ import ( _ "github.com/hashicorp/nomad/e2e/clientstate" _ "github.com/hashicorp/nomad/e2e/consul" _ "github.com/hashicorp/nomad/e2e/consultemplate" + _ "github.com/hashicorp/nomad/e2e/deployment" _ "github.com/hashicorp/nomad/e2e/example" _ "github.com/hashicorp/nomad/e2e/nomad09upgrade" _ "github.com/hashicorp/nomad/e2e/nomadexec" diff --git a/e2e/e2eutil/utils.go b/e2e/e2eutil/utils.go index ad17bfa68b4..212282cda82 100644 --- a/e2e/e2eutil/utils.go +++ b/e2e/e2eutil/utils.go @@ -54,15 +54,13 @@ func WaitForNodesReady(t *testing.T, nomadClient *api.Client, nodes int) { }) } -func RegisterAndWaitForAllocs(t *testing.T, nomadClient *api.Client, jobFile string, jobID string) []*api.AllocationListStub { +func RegisterAllocs(t *testing.T, nomadClient *api.Client, jobFile string, jobID string) []*api.AllocationListStub { // Parse job job, err := jobspec.ParseFile(jobFile) require := require.New(t) require.Nil(err) job.ID = helper.StringToPtr(jobID) - g := NewGomegaWithT(t) - // Register job jobs := nomadClient.Jobs() testutil.WaitForResult(func() (bool, error) { @@ -75,15 +73,27 @@ func RegisterAndWaitForAllocs(t *testing.T, nomadClient *api.Client, jobFile str require.NoError(err) }) + allocs, _, _ := jobs.Allocations(jobID, false, nil) + return allocs +} + +func RegisterAndWaitForAllocs(t *testing.T, nomadClient *api.Client, jobFile string, jobID string) []*api.AllocationListStub { + require := require.New(t) + g := NewGomegaWithT(t) + jobs := nomadClient.Jobs() + + // Start allocations + RegisterAllocs(t, nomadClient, jobFile, jobID) + // Wrap in retry to wait until placement g.Eventually(func() []*api.AllocationListStub { // Look for allocations - allocs, _, _ := 
jobs.Allocations(*job.ID, false, nil) + allocs, _, _ := jobs.Allocations(jobID, false, nil) return allocs }, 30*time.Second, time.Second).ShouldNot(BeEmpty()) - allocs, _, err := jobs.Allocations(*job.ID, false, nil) - require.Nil(err) + allocs, _, err := jobs.Allocations(jobID, false, nil) + require.NoError(err) return allocs } @@ -100,3 +110,26 @@ func WaitForAllocRunning(t *testing.T, nomadClient *api.Client, allocID string) t.Fatalf("failed to wait on alloc: %v", err) }) } + +func WaitForDeployment(t *testing.T, nomadClient *api.Client, deployID string, status string, statusDesc string) { + testutil.WaitForResultRetries(retries, func() (bool, error) { + time.Sleep(time.Millisecond * 100) + deploy, _, err := nomadClient.Deployments().Info(deployID, nil) + if err != nil { + return false, err + } + + if deploy.Status == status && deploy.StatusDescription == statusDesc { + return true, nil + } + return false, fmt.Errorf("expected status %s \"%s\", but got: %s \"%s\"", + status, + statusDesc, + deploy.Status, + deploy.StatusDescription, + ) + + }, func(err error) { + t.Fatalf("failed to wait on deployment: %v", err) + }) +} diff --git a/e2e/terraform/README.md b/e2e/terraform/README.md index 6a2ca4b1312..e41873c3544 100644 --- a/e2e/terraform/README.md +++ b/e2e/terraform/README.md @@ -10,7 +10,7 @@ Use [envchain](https://github.com/sorah/envchain) to store your AWS credentials. ``` $ cd e2e/terraform/ -$ envchain nomadaws TF_VAR_nomad_sha= terraform apply +$ TF_VAR_nomad_sha= envchain nomadaws terraform apply ``` After this step, you should have a nomad client address to point the end to end tests in the `e2e` folder to. diff --git a/jobspec/parse.go b/jobspec/parse.go index 2e281137a20..d881866c12a 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -1594,7 +1594,6 @@ func parseUpdate(result **api.UpdateStrategy, list *ast.ObjectList) error { // Check for invalid keys valid := []string{ - // COMPAT: Remove in 0.7.0. Stagger is deprecated in 0.6.0.
"stagger", "max_parallel", "health_check", @@ -1602,6 +1601,7 @@ func parseUpdate(result **api.UpdateStrategy, list *ast.ObjectList) error { "healthy_deadline", "progress_deadline", "auto_revert", + "auto_promote", "canary", } if err := helper.CheckHCLKeys(o.Val, valid); err != nil { diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 1fc84f0f907..6fa29513443 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -79,6 +79,7 @@ func TestParse(t *testing.T) { HealthyDeadline: helper.TimeToPtr(10 * time.Minute), ProgressDeadline: helper.TimeToPtr(10 * time.Minute), AutoRevert: helper.BoolToPtr(true), + AutoPromote: helper.BoolToPtr(true), Canary: helper.IntToPtr(1), }, @@ -163,6 +164,7 @@ func TestParse(t *testing.T) { HealthyDeadline: helper.TimeToPtr(1 * time.Minute), ProgressDeadline: helper.TimeToPtr(1 * time.Minute), AutoRevert: helper.BoolToPtr(false), + AutoPromote: helper.BoolToPtr(false), Canary: helper.IntToPtr(2), }, Migrate: &api.MigrateStrategy{ diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 5a9184be8cd..86162cbe22a 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -42,6 +42,7 @@ job "binstore-storagelocker" { healthy_deadline = "10m" progress_deadline = "10m" auto_revert = true + auto_promote = true canary = 1 } @@ -84,6 +85,7 @@ job "binstore-storagelocker" { healthy_deadline = "1m" progress_deadline = "1m" auto_revert = false + auto_promote = false canary = 2 } diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go index c798cbcedb6..e41fa2054af 100644 --- a/nomad/deploymentwatcher/deployment_watcher.go +++ b/nomad/deploymentwatcher/deployment_watcher.go @@ -265,6 +265,38 @@ func (w *deploymentWatcher) PromoteDeployment( return nil } +// autoPromoteDeployment creates a synthetic promotion request, and upserts it for processing +func (w *deploymentWatcher) autoPromoteDeployment(allocs []*structs.AllocListStub) error { + d := w.getDeployment() + if !d.HasPlacedCanaries() || !d.RequiresPromotion() { + return nil + } + + // AutoPromote iff every task group is marked auto_promote and is healthy. The whole + // job version has been incremented, so we promote together. See also AutoRevert + for _, tv := range d.TaskGroups { + if !tv.AutoPromote || tv.DesiredCanaries != len(tv.PlacedCanaries) { + return nil + } + + // Find the health status of each canary + for _, c := range tv.PlacedCanaries { + for _, a := range allocs { + if c == a.ID && !a.DeploymentStatus.IsHealthy() { + return nil + } + } + } + } + + // Send the request + _, err := w.upsertDeploymentPromotion(&structs.ApplyDeploymentPromoteRequest{ + DeploymentPromoteRequest: structs.DeploymentPromoteRequest{DeploymentID: d.GetID(), All: true}, + Eval: w.getEval(), + }) + return err +} + func (w *deploymentWatcher) PauseDeployment( req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { @@ -381,6 +413,7 @@ FAIL: for { select { case <-w.ctx.Done(): + // This is the successful case, and we stop the loop return case <-deadlineTimer.C: // We have hit the progress deadline so fail the deployment. 
We need @@ -460,6 +493,12 @@ FAIL: break FAIL } + // If permitted, automatically promote this canary deployment + err = w.autoPromoteDeployment(updates.allocs) + if err != nil { + w.logger.Error("failed to auto promote deployment", "error", err) + } + // Create an eval to push the deployment along if res.createEval || len(res.allowReplacements) != 0 { w.createBatchedUpdate(res.allowReplacements, allocIndex) @@ -779,8 +818,10 @@ type allocUpdates struct { err error } -// getAllocsCh retrieves the allocations that are part of the deployment blocking -// at the given index. +// getAllocsCh creates a channel and starts a goroutine that +// 1. parks a blocking query for allocations on the state +// 2. reads those and drops them on the channel +// This query runs once here, but watch calls it in a loop func (w *deploymentWatcher) getAllocsCh(index uint64) <-chan *allocUpdates { out := make(chan *allocUpdates, 1) go func() { diff --git a/nomad/deploymentwatcher/deployments_watcher.go b/nomad/deploymentwatcher/deployments_watcher.go index 8cd695fcc66..494d5c16d5b 100644 --- a/nomad/deploymentwatcher/deployments_watcher.go +++ b/nomad/deploymentwatcher/deployments_watcher.go @@ -221,7 +221,7 @@ func (w *Watcher) add(d *structs.Deployment) error { } // addLocked adds a deployment to the watch list and should only be called when -// locked. +// locked. Creating the deploymentWatcher starts a go routine to .watch() it func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { // Not enabled so no-op if !w.enabled { diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go index a92a8e029a4..57d59da78cd 100644 --- a/nomad/deploymentwatcher/deployments_watcher_test.go +++ b/nomad/deploymentwatcher/deployments_watcher_test.go @@ -486,6 +486,7 @@ func TestWatcher_PromoteDeployment_UnhealthyCanaries(t *testing.T) { var resp structs.DeploymentUpdateResponse err := w.PromoteDeployment(req, &resp) if assert.NotNil(t, err, "PromoteDeployment") { + // 0/2 because the old version has been stopped but the canary isn't marked healthy yet require.Contains(err.Error(), `Task group "web" has 0/2 healthy allocations`, "Should error because canary isn't marked healthy") } @@ -493,6 +494,126 @@ func TestWatcher_PromoteDeployment_UnhealthyCanaries(t *testing.T) { m.AssertCalled(t, "UpdateDeploymentPromotion", mocker.MatchedBy(matcher)) } +func TestWatcher_AutoPromoteDeployment(t *testing.T) { + t.Parallel() + w, m := defaultTestDeploymentWatcher(t) + now := time.Now() + + // Create 1 UpdateStrategy, 1 job (1 TaskGroup), 2 canaries, and 1 deployment + upd := structs.DefaultUpdateStrategy.Copy() + upd.AutoPromote = true + upd.MaxParallel = 2 + upd.Canary = 2 + upd.ProgressDeadline = 5 * time.Second + + j := mock.Job() + j.TaskGroups[0].Update = upd + + d := mock.Deployment() + d.JobID = j.ID + // This is created in scheduler.computeGroup at runtime, where properties from the + // UpdateStrategy are copied in + d.TaskGroups = map[string]*structs.DeploymentState{ + "web": { + AutoPromote: upd.AutoPromote, + AutoRevert: upd.AutoRevert, + ProgressDeadline: upd.ProgressDeadline, + DesiredTotal: 2, + }, + } + + alloc := func() *structs.Allocation { + a := mock.Alloc() + a.DeploymentID = d.ID + a.CreateTime = now.UnixNano() + a.ModifyTime = now.UnixNano() + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: true, + } + return a + } + + a := alloc() + b := alloc() + + d.TaskGroups[a.TaskGroup].PlacedCanaries = []string{a.ID, b.ID} + 
d.TaskGroups[a.TaskGroup].DesiredCanaries = 2 + require.NoError(t, m.state.UpsertJob(m.nextIndex(), j), "UpsertJob") + require.NoError(t, m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment") + require.NoError(t, m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a, b}), "UpsertAllocs") + + // ============================================================= + // Support method calls + matchConfig0 := &matchDeploymentStatusUpdateConfig{ + DeploymentID: d.ID, + Status: structs.DeploymentStatusFailed, + StatusDescription: structs.DeploymentStatusDescriptionProgressDeadline, + Eval: true, + } + matcher0 := matchDeploymentStatusUpdateRequest(matchConfig0) + m.On("UpdateDeploymentStatus", mocker.MatchedBy(matcher0)).Return(nil) + + matchConfig1 := &matchDeploymentAllocHealthRequestConfig{ + DeploymentID: d.ID, + Healthy: []string{a.ID, b.ID}, + Eval: true, + } + matcher1 := matchDeploymentAllocHealthRequest(matchConfig1) + m.On("UpdateDeploymentAllocHealth", mocker.MatchedBy(matcher1)).Return(nil) + + matchConfig2 := &matchDeploymentPromoteRequestConfig{ + Promotion: &structs.DeploymentPromoteRequest{ + DeploymentID: d.ID, + All: true, + }, + Eval: true, + } + matcher2 := matchDeploymentPromoteRequest(matchConfig2) + m.On("UpdateDeploymentPromotion", mocker.MatchedBy(matcher2)).Return(nil) + // ============================================================= + + // Start the deployment + w.SetEnabled(true, m.state) + testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil }, + func(err error) { require.Equal(t, 1, len(w.watchers), "Should have 1 deployment") }) + + // Mark the canaries healthy + req := &structs.DeploymentAllocHealthRequest{ + DeploymentID: d.ID, + HealthyAllocationIDs: []string{a.ID, b.ID}, + } + var resp structs.DeploymentUpdateResponse + // Calls w.raft.UpdateDeploymentAllocHealth, which is implemented by StateStore in + // state.UpdateDeploymentAllocHealth via a raft shim? + err := w.SetAllocHealth(req, &resp) + require.NoError(t, err) + + ws := memdb.NewWatchSet() + + testutil.WaitForResult( + func() (bool, error) { + ds, _ := m.state.DeploymentsByJobID(ws, j.Namespace, j.ID, true) + d = ds[0] + return 2 == d.TaskGroups["web"].HealthyAllocs, nil + }, + func(err error) { require.NoError(t, err) }, + ) + + require.Equal(t, 1, len(w.watchers), "Deployment should still be active") + m.AssertCalled(t, "UpdateDeploymentPromotion", mocker.MatchedBy(matcher2)) + + require.Equal(t, "running", d.Status) + require.True(t, d.TaskGroups["web"].Promoted) + + a1, _ := m.state.AllocByID(ws, a.ID) + require.False(t, a1.DeploymentStatus.Canary) + require.Equal(t, "pending", a1.ClientStatus) + require.Equal(t, "run", a1.DesiredStatus) + + b1, _ := m.state.AllocByID(ws, b.ID) + require.False(t, b1.DeploymentStatus.Canary) +} + // Test pausing a deployment that is running func TestWatcher_PauseDeployment_Pause_Running(t *testing.T) { t.Parallel() diff --git a/nomad/deploymentwatcher/doc.go b/nomad/deploymentwatcher/doc.go new file mode 100644 index 00000000000..e1f366b3c76 --- /dev/null +++ b/nomad/deploymentwatcher/doc.go @@ -0,0 +1,7 @@ +// deploymentwatcher creates and tracks Deployments, which hold meta data describing the +// process of upgrading a running job to a new set of Allocations. This encompasses settings +// for canary deployments and blue/green rollouts. +// +// - The watcher is only enabled on the active raft leader. 
+// - func (w *deploymentWatcher) watch() is the main deploymentWatcher process +package deploymentwatcher diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 8bb3b3352e5..c413ff4721d 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -2942,7 +2942,7 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD } } - // For each promotable allocation remoce the canary field + // For each promotable allocation remove the canary field for _, alloc := range promotable { promoted := alloc.Copy() promoted.DeploymentStatus.Canary = false diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index a5bf9a1bd22..3dbe77c88d9 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -57,8 +57,7 @@ type JobDiff struct { // diffable. If contextual diff is enabled, objects within the job will contain // field information even if unchanged. func (j *Job) Diff(other *Job, contextual bool) (*JobDiff, error) { - // COMPAT: Remove "Update" in 0.7.0. Update pushed down to task groups - // in 0.6.0 + // See agent.ApiJobToStructJob Update is a default for TaskGroups diff := &JobDiff{Type: DiffTypeNone} var oldPrimitiveFlat, newPrimitiveFlat map[string]string filter := []string{"ID", "Status", "StatusDescription", "Version", "Stable", "CreateIndex", diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 42321a41312..dbdd83c7fb6 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -1977,6 +1977,12 @@ func TestTaskGroupDiff(t *testing.T) { Type: DiffTypeDeleted, Name: "Update", Fields: []*FieldDiff{ + { + Type: DiffTypeDeleted, + Name: "AutoPromote", + Old: "false", + New: "", + }, { Type: DiffTypeDeleted, Name: "AutoRevert", @@ -2033,6 +2039,12 @@ func TestTaskGroupDiff(t *testing.T) { Type: DiffTypeAdded, Name: "Update", Fields: []*FieldDiff{ + { + Type: DiffTypeAdded, + Name: "AutoPromote", + Old: "", + New: "false", + }, { Type: DiffTypeAdded, Name: "AutoRevert", @@ -2084,6 +2096,7 @@ func TestTaskGroupDiff(t *testing.T) { HealthyDeadline: 30 * time.Second, ProgressDeadline: 29 * time.Second, AutoRevert: true, + AutoPromote: true, Canary: 2, }, }, @@ -2095,6 +2108,7 @@ func TestTaskGroupDiff(t *testing.T) { HealthyDeadline: 31 * time.Second, ProgressDeadline: 32 * time.Second, AutoRevert: false, + AutoPromote: false, Canary: 1, }, }, @@ -2105,6 +2119,12 @@ func TestTaskGroupDiff(t *testing.T) { Type: DiffTypeEdited, Name: "Update", Fields: []*FieldDiff{ + { + Type: DiffTypeEdited, + Name: "AutoPromote", + Old: "true", + New: "false", + }, { Type: DiffTypeEdited, Name: "AutoRevert", @@ -2163,6 +2183,7 @@ func TestTaskGroupDiff(t *testing.T) { HealthyDeadline: 30 * time.Second, ProgressDeadline: 30 * time.Second, AutoRevert: true, + AutoPromote: true, Canary: 2, }, }, @@ -2174,6 +2195,7 @@ func TestTaskGroupDiff(t *testing.T) { HealthyDeadline: 30 * time.Second, ProgressDeadline: 30 * time.Second, AutoRevert: true, + AutoPromote: true, Canary: 2, }, }, @@ -2184,6 +2206,12 @@ func TestTaskGroupDiff(t *testing.T) { Type: DiffTypeEdited, Name: "Update", Fields: []*FieldDiff{ + { + Type: DiffTypeNone, + Name: "AutoPromote", + Old: "true", + New: "true", + }, { Type: DiffTypeNone, Name: "AutoRevert", diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 33c81dd7e24..fead1a641d3 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3231,7 +3231,8 @@ type Job struct { // to run. Each task group is an atomic unit of scheduling and placement. 
TaskGroups []*TaskGroup - // COMPAT: Remove in 0.7.0. Stagger is deprecated in 0.6.0. + // See agent.ApiJobToStructJob + // Update provides defaults for the TaskGroup Update stanzas Update UpdateStrategy // Periodic is used to define the interval the job is run at. @@ -3481,11 +3482,21 @@ func (j *Job) Warnings() error { var mErr multierror.Error // Check the groups + ap := 0 for _, tg := range j.TaskGroups { if err := tg.Warnings(j); err != nil { outer := fmt.Errorf("Group %q has warnings: %v", tg.Name, err) mErr.Errors = append(mErr.Errors, outer) } + if tg.Update != nil && tg.Update.AutoPromote { + ap += 1 + } + } + + // Check AutoPromote, should be all or none + if ap > 0 && ap < len(j.TaskGroups) { + err := fmt.Errorf("auto_promote must be true for all groups to enable automatic promotion") + mErr.Errors = append(mErr.Errors, err) } return mErr.ErrorOrNil() @@ -3804,6 +3815,7 @@ var ( HealthyDeadline: 5 * time.Minute, ProgressDeadline: 10 * time.Minute, AutoRevert: false, + AutoPromote: false, Canary: 0, } ) @@ -3842,6 +3854,10 @@ type UpdateStrategy struct { // stable version. AutoRevert bool + // AutoPromote declares that the deployment should be promoted when all canaries are + // healthy + AutoPromote bool + // Canary is the number of canaries to deploy when a change to the task // group is detected. Canary int @@ -3875,6 +3891,9 @@ func (u *UpdateStrategy) Validate() error { if u.Canary < 0 { multierror.Append(&mErr, fmt.Errorf("Canary count can not be less than zero: %d < 0", u.Canary)) } + if u.Canary == 0 && u.AutoPromote { + multierror.Append(&mErr, fmt.Errorf("Auto Promote requires a Canary count greater than zero")) + } if u.MinHealthyTime < 0 { multierror.Append(&mErr, fmt.Errorf("Minimum healthy time may not be less than zero: %v", u.MinHealthyTime)) } @@ -6994,10 +7013,13 @@ const ( DeploymentStatusSuccessful = "successful" DeploymentStatusCancelled = "cancelled" + // TODO Statuses and Descriptions do not match 1:1 and we sometimes use the Description as a status flag + // DeploymentStatusDescriptions are the various descriptions of the states a // deployment can be in. 
DeploymentStatusDescriptionRunning = "Deployment is running" - DeploymentStatusDescriptionRunningNeedsPromotion = "Deployment is running but requires promotion" + DeploymentStatusDescriptionRunningNeedsPromotion = "Deployment is running but requires manual promotion" + DeploymentStatusDescriptionRunningAutoPromotion = "Deployment is running pending automatic promotion" DeploymentStatusDescriptionPaused = "Deployment is paused" DeploymentStatusDescriptionSuccessful = "Deployment completed successfully" DeploymentStatusDescriptionStoppedJob = "Cancelled because job is stopped" @@ -7148,6 +7170,19 @@ func (d *Deployment) RequiresPromotion() bool { return false } +// HasAutoPromote determines if all taskgroups are marked auto_promote +func (d *Deployment) HasAutoPromote() bool { + if d == nil || len(d.TaskGroups) == 0 || d.Status != DeploymentStatusRunning { + return false + } + for _, group := range d.TaskGroups { + if !group.AutoPromote { + return false + } + } + return true +} + func (d *Deployment) GoString() string { base := fmt.Sprintf("Deployment ID %q for job %q has status %q (%v):", d.ID, d.JobID, d.Status, d.StatusDescription) for group, state := range d.TaskGroups { @@ -7162,6 +7197,10 @@ type DeploymentState struct { // reverted on failure AutoRevert bool + // AutoPromote marks promotion triggered automatically by healthy canaries + // copied from TaskGroup UpdateStrategy in scheduler.reconcile + AutoPromote bool + // ProgressDeadline is the deadline by which an allocation must transition // to healthy before the deployment is considered failed. ProgressDeadline time.Duration @@ -7202,6 +7241,7 @@ func (d *DeploymentState) GoString() string { base += fmt.Sprintf("\n\tHealthy: %d", d.HealthyAllocs) base += fmt.Sprintf("\n\tUnhealthy: %d", d.UnhealthyAllocs) base += fmt.Sprintf("\n\tAutoRevert: %v", d.AutoRevert) + base += fmt.Sprintf("\n\tAutoPromote: %v", d.AutoPromote) return base } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 186e1268eeb..ba34e31ef37 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -144,6 +144,25 @@ func TestJob_Warnings(t *testing.T) { }, }, }, + { + Name: "AutoPromote mixed TaskGroups", + Expected: []string{"auto_promote must be true for all groups"}, + Job: &Job{ + Type: JobTypeService, + TaskGroups: []*TaskGroup{ + { + Update: &UpdateStrategy{ + AutoPromote: true, + }, + }, + { + Update: &UpdateStrategy{ + AutoPromote: false, + }, + }, + }, + }, + }, } for _, c := range cases { diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 43a73a2109c..65262ff938c 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -218,7 +218,11 @@ func (a *allocReconciler) Compute() *reconcileResults { // Set the description of a created deployment if d := a.result.deployment; d != nil { if d.RequiresPromotion() { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion + if d.HasAutoPromote() { + d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion + } else { + d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion + } } } @@ -327,6 +331,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { dstate = &structs.DeploymentState{} if tg.Update != nil { dstate.AutoRevert = tg.Update.AutoRevert + dstate.AutoPromote = tg.Update.AutoPromote dstate.ProgressDeadline = tg.Update.ProgressDeadline } } diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index 
b0a26fcaf87..2b6f2fb54f5 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -691,6 +691,9 @@ determined. The potential values are: they can be promoted which unblocks a rolling update of the remaining allocations at a rate of `max_parallel`. +- `AutoPromote` - Specifies if the deployment should be promoted automatically + when all canaries become healthy. + - `Stagger` - Specifies the delay between migrating allocations off nodes marked for draining. @@ -704,6 +707,7 @@ An example `Update` block: "MinHealthyTime": 15000000000, "HealthyDeadline": 180000000000, "AutoRevert": false, + "AutoPromote": false, "Canary": 1 } } diff --git a/website/source/docs/job-specification/update.html.md b/website/source/docs/job-specification/update.html.md index d2acd86461b..e0e2ca3be83 100644 --- a/website/source/docs/job-specification/update.html.md +++ b/website/source/docs/job-specification/update.html.md @@ -37,6 +37,7 @@ job "docs" { healthy_deadline = "5m" progress_deadline = "10m" auto_revert = true + auto_promote = true canary = 1 stagger = "30s" } @@ -92,6 +93,11 @@ job "docs" { last stable job on deployment failure. A job is marked as stable if all the allocations as part of its deployment were marked healthy. +- `auto_promote` `(bool: false)` - Specifies if the job should automatically promote the + deployment to the canary version when all canaries become healthy. Defaults to + false, which means canaries must be promoted manually with the `nomad deployment promote` + command. + - `canary` `(int: 0)` - Specifies that changes to the job that would result in destructive updates should create the specified number of canaries without stopping any previous allocations. Once the operator determines the canaries diff --git a/website/source/guides/operating-a-job/update-strategies/blue-green-and-canary-deployments.html.md b/website/source/guides/operating-a-job/update-strategies/blue-green-and-canary-deployments.html.md index c811c1194ba..02e4241513e 100644 --- a/website/source/guides/operating-a-job/update-strategies/blue-green-and-canary-deployments.html.md +++ b/website/source/guides/operating-a-job/update-strategies/blue-green-and-canary-deployments.html.md @@ -45,6 +45,7 @@ job "docs" { min_healthy_time = "30s" healthy_deadline = "10m" auto_revert = true + auto_promote = false } task "api-server" { @@ -305,6 +306,7 @@ job "docs" { min_healthy_time = "30s" healthy_deadline = "10m" auto_revert = true + auto_promote = false } task "api-server" {
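
For quick reference, here is a minimal sketch of a jobspec that opts into automatic promotion (the job, group, task names, and Docker image are illustrative, not taken from this change). It follows the rules introduced above: `auto_promote` requires a `canary` count greater than zero, and the job emits a warning unless either all task groups or none of them enable `auto_promote`.

```hcl
job "example" {
  datacenters = ["dc1"]

  group "api" {
    count = 3

    update {
      max_parallel     = 2
      canary           = 2      # auto_promote requires a canary count greater than zero
      auto_promote     = true   # promote automatically once every canary is healthy
      min_healthy_time = "10s"
      healthy_deadline = "5m"
      auto_revert      = true
    }

    task "api" {
      driver = "docker"

      config {
        image = "example/api:1.0"
      }
    }
  }
}
```

With `auto_promote = false` (the default), the same deployment would instead sit at "Deployment is running but requires manual promotion" until `nomad deployment promote` is run.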