Skip to content

Commit

Permalink
Prevent status flap when re-registering a check (#4904)
Browse files Browse the repository at this point in the history
Fixes point `#2` of: #4903

When registering a service each healthcheck status is saved and restored (https://github.com/hashicorp/consul/blob/master/agent/agent.go#L1914) to avoid unnecessary flaps in health state.
This change extends this feature to single check registration by moving this protection in `AddCheck()` so that both `PUT /v1/agent/service/register` and `PUT /v1/agent/check/register` behave in the same idempotent way.

#### Steps to reproduce
1. Register a check :
```
curl -X PUT \
  http://127.0.0.1:8500/v1/agent/check/register \
  -H 'Content-Type: application/json' \
  -d '{
  "Name": "my_check",
  "ServiceID": "srv",
  "Interval": "10s",
  "Args": ["true"]
}'
```
2. The check will initialize and change to `passing`
3. Run the same request again
4. The check status will quickly go from `critical` to `passing` (the delay for this transission is determined by https://github.com/hashicorp/consul/blob/master/agent/checks/check.go#L95)
  • Loading branch information
Aestek authored and mkeeler committed Jan 7, 2019
1 parent 516ba47 commit 8709213
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 6 deletions.
13 changes: 8 additions & 5 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1908,11 +1908,6 @@ func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.Che
a.PauseSync()
defer a.ResumeSync()

// Take a snapshot of the current state of checks (if any), and
// restore them before resuming anti-entropy.
snap := a.snapshotCheckState()
defer a.restoreCheckState(snap)

// Add the service
a.State.AddService(service, token)

Expand Down Expand Up @@ -2052,6 +2047,14 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType,
a.checkLock.Lock()
defer a.checkLock.Unlock()

// snapshot the current state of the health check to avoid potential flapping
existing := a.State.Check(check.CheckID)
defer func() {
if existing != nil {
a.State.UpdateCheck(check.CheckID, existing.Status, existing.Output)
}
}()

// Check if already registered
if chkType != nil {
switch {
Expand Down
45 changes: 44 additions & 1 deletion agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2467,7 +2467,7 @@ func TestAgent_Service_NoReap(t *testing.T) {
}
}

func TestAgent_addCheck_restoresSnapshot(t *testing.T) {
func TestAgent_AddService_restoresSnapshot(t *testing.T) {
t.Parallel()
a := NewTestAgent(t.Name(), "")
defer a.Shutdown()
Expand Down Expand Up @@ -2510,6 +2510,49 @@ func TestAgent_addCheck_restoresSnapshot(t *testing.T) {
}
}

func TestAgent_AddCheck_restoresSnapshot(t *testing.T) {
t.Parallel()
a := NewTestAgent(t.Name(), "")
defer a.Shutdown()

// First register a service
svc := &structs.NodeService{
ID: "redis",
Service: "redis",
Tags: []string{"foo"},
Port: 8000,
}
if err := a.AddService(svc, nil, false, "", ConfigSourceLocal); err != nil {
t.Fatalf("err: %v", err)
}

// Register a check
check1 := &structs.HealthCheck{
Node: a.Config.NodeName,
CheckID: "service:redis",
Name: "redischeck",
Status: api.HealthPassing,
ServiceID: "redis",
ServiceName: "redis",
}
if err := a.AddCheck(check1, nil, false, "", ConfigSourceLocal); err != nil {
t.Fatalf("err: %s", err)
}

// Re-registering the check preserves its state
check1.Status = ""
if err := a.AddCheck(check1, &structs.CheckType{TTL: 30 * time.Second}, false, "", ConfigSourceLocal); err != nil {
t.Fatalf("err: %s", err)
}
check, ok := a.State.Checks()["service:redis"]
if !ok {
t.Fatalf("missing check")
}
if check.Status != api.HealthPassing {
t.Fatalf("bad: %s", check.Status)
}
}

func TestAgent_NodeMaintenanceMode(t *testing.T) {
t.Parallel()
a := NewTestAgent(t.Name(), "")
Expand Down

0 comments on commit 8709213

Please sign in to comment.