From 8709213d6ed13d0f5ccd3e2c4523ee0a5be5b9f8 Mon Sep 17 00:00:00 2001 From: Aestek Date: Mon, 7 Jan 2019 19:53:03 +0100 Subject: [PATCH] Prevent status flap when re-registering a check (#4904) Fixes point `#2` of: https://github.com/hashicorp/consul/issues/4903 When registering a service each healthcheck status is saved and restored (https://github.com/hashicorp/consul/blob/master/agent/agent.go#L1914) to avoid unnecessary flaps in health state. This change extends this feature to single check registration by moving this protection in `AddCheck()` so that both `PUT /v1/agent/service/register` and `PUT /v1/agent/check/register` behave in the same idempotent way. #### Steps to reproduce 1. Register a check : ``` curl -X PUT \ http://127.0.0.1:8500/v1/agent/check/register \ -H 'Content-Type: application/json' \ -d '{ "Name": "my_check", "ServiceID": "srv", "Interval": "10s", "Args": ["true"] }' ``` 2. The check will initialize and change to `passing` 3. Run the same request again 4. The check status will quickly go from `critical` to `passing` (the delay for this transission is determined by https://github.com/hashicorp/consul/blob/master/agent/checks/check.go#L95) --- agent/agent.go | 13 ++++++++----- agent/agent_test.go | 45 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 289653cd1bf2..9e161b3f0bd9 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1908,11 +1908,6 @@ func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.Che a.PauseSync() defer a.ResumeSync() - // Take a snapshot of the current state of checks (if any), and - // restore them before resuming anti-entropy. - snap := a.snapshotCheckState() - defer a.restoreCheckState(snap) - // Add the service a.State.AddService(service, token) @@ -2052,6 +2047,14 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, a.checkLock.Lock() defer a.checkLock.Unlock() + // snapshot the current state of the health check to avoid potential flapping + existing := a.State.Check(check.CheckID) + defer func() { + if existing != nil { + a.State.UpdateCheck(check.CheckID, existing.Status, existing.Output) + } + }() + // Check if already registered if chkType != nil { switch { diff --git a/agent/agent_test.go b/agent/agent_test.go index 7551a597704e..7db415343e37 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2467,7 +2467,7 @@ func TestAgent_Service_NoReap(t *testing.T) { } } -func TestAgent_addCheck_restoresSnapshot(t *testing.T) { +func TestAgent_AddService_restoresSnapshot(t *testing.T) { t.Parallel() a := NewTestAgent(t.Name(), "") defer a.Shutdown() @@ -2510,6 +2510,49 @@ func TestAgent_addCheck_restoresSnapshot(t *testing.T) { } } +func TestAgent_AddCheck_restoresSnapshot(t *testing.T) { + t.Parallel() + a := NewTestAgent(t.Name(), "") + defer a.Shutdown() + + // First register a service + svc := &structs.NodeService{ + ID: "redis", + Service: "redis", + Tags: []string{"foo"}, + Port: 8000, + } + if err := a.AddService(svc, nil, false, "", ConfigSourceLocal); err != nil { + t.Fatalf("err: %v", err) + } + + // Register a check + check1 := &structs.HealthCheck{ + Node: a.Config.NodeName, + CheckID: "service:redis", + Name: "redischeck", + Status: api.HealthPassing, + ServiceID: "redis", + ServiceName: "redis", + } + if err := a.AddCheck(check1, nil, false, "", ConfigSourceLocal); err != nil { + t.Fatalf("err: %s", err) + } + + // Re-registering the check preserves its state + check1.Status = "" + if err := a.AddCheck(check1, &structs.CheckType{TTL: 30 * time.Second}, false, "", ConfigSourceLocal); err != nil { + t.Fatalf("err: %s", err) + } + check, ok := a.State.Checks()["service:redis"] + if !ok { + t.Fatalf("missing check") + } + if check.Status != api.HealthPassing { + t.Fatalf("bad: %s", check.Status) + } +} + func TestAgent_NodeMaintenanceMode(t *testing.T) { t.Parallel() a := NewTestAgent(t.Name(), "")