Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUGFIX] Configuration reload does not discard Check's statuses for services #7345

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 11 additions & 18 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ func (a *Agent) Start() error {
a.serviceManager.Start()

// Load checks/services/metadata.
if err := a.loadServices(c); err != nil {
if err := a.loadServices(c, nil); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason why we don't need to pass in a snapshot is that the agent just started and it doesn't have any information in memory about anything pretty much.

return err
}
if err := a.loadChecks(c, nil); err != nil {
Expand Down Expand Up @@ -2289,7 +2289,7 @@ func (a *Agent) AddServiceAndReplaceChecks(service *structs.NodeService, chkType
token: token,
replaceExistingChecks: true,
source: source,
})
}, a.snapshotCheckState())
}

// AddService is used to add a service entry.
Expand All @@ -2308,12 +2308,12 @@ func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.Che
token: token,
replaceExistingChecks: false,
source: source,
})
}, a.snapshotCheckState())
}

// addServiceLocked adds a service entry to the service manager if enabled, or directly
// to the local state if it is not. This function assumes the state lock is already held.
func (a *Agent) addServiceLocked(req *addServiceRequest) error {
func (a *Agent) addServiceLocked(req *addServiceRequest, snap map[structs.CheckID]*structs.HealthCheck) error {
req.fixupForAddServiceLocked()

req.service.EnterpriseMeta.Normalize()
Expand All @@ -2331,7 +2331,7 @@ func (a *Agent) addServiceLocked(req *addServiceRequest) error {
req.persistDefaults = nil
req.persistServiceConfig = false

return a.addServiceInternal(req)
return a.addServiceInternal(req, snap)
}

// addServiceRequest is the union of arguments for calling both
Expand Down Expand Up @@ -2369,7 +2369,7 @@ func (r *addServiceRequest) fixupForAddServiceInternal() {
}

// addServiceInternal adds the given service and checks to the local state.
func (a *Agent) addServiceInternal(req *addServiceRequest) error {
func (a *Agent) addServiceInternal(req *addServiceRequest, snap map[structs.CheckID]*structs.HealthCheck) error {
req.fixupForAddServiceInternal()
var (
service = req.service
Expand Down Expand Up @@ -2407,11 +2407,6 @@ func (a *Agent) addServiceInternal(req *addServiceRequest) error {
service.TaggedAddresses[structs.TaggedAddressWANIPv6] = structs.ServiceAddress{Address: service.Address, Port: service.Port}
}

// Take a snapshot of the current state of checks (if any), and when adding
// a check that already existed carry over the state before resuming
// anti-entropy.
snap := a.snapshotCheckState()

var checks []*structs.HealthCheck

// all the checks must be associated with the same enterprise meta of the service
Expand Down Expand Up @@ -3489,7 +3484,7 @@ func (a *Agent) deletePid() error {

// loadServices will load service definitions from configuration and persisted
// definitions on disk, and load them into the local agent.
func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
func (a *Agent) loadServices(conf *config.RuntimeConfig, snap map[structs.CheckID]*structs.HealthCheck) error {
// Load any persisted service configs so we can feed those into the initial
// registrations below.
persistedServiceConfigs, err := a.readPersistedServiceConfigs()
Expand Down Expand Up @@ -3526,7 +3521,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
token: service.Token,
replaceExistingChecks: false, // do default behavior
source: ConfigSourceLocal,
})
}, snap)
if err != nil {
return fmt.Errorf("Failed to register service %q: %v", service.Name, err)
}
Expand All @@ -3544,7 +3539,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
token: sidecarToken,
replaceExistingChecks: false, // do default behavior
source: ConfigSourceLocal,
})
}, snap)
if err != nil {
return fmt.Errorf("Failed to register sidecar for service %q: %v", service.Name, err)
}
Expand Down Expand Up @@ -3636,8 +3631,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
token: p.Token,
replaceExistingChecks: false, // do default behavior
source: source,
},
)
}, snap)
if err != nil {
return fmt.Errorf("failed adding service %q: %s", serviceID, err)
}
Expand Down Expand Up @@ -3672,7 +3666,6 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig, snap map[structs.CheckID]
// Register the checks from config
for _, check := range conf.Checks {
health := check.HealthCheck(conf.NodeName)

// Restore the fields from the snapshot.
if prev, ok := snap[health.CompoundCheckID()]; ok {
health.Output = prev.Output
Expand Down Expand Up @@ -4025,7 +4018,7 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
}

// Reload service/check definitions and metadata.
if err := a.loadServices(newCfg); err != nil {
if err := a.loadServices(newCfg, snap); err != nil {
return fmt.Errorf("Failed reloading services: %s", err)
}
if err := a.loadChecks(newCfg, snap); err != nil {
Expand Down
50 changes: 49 additions & 1 deletion agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2030,7 +2030,7 @@ func testAgent_persistedService_compat(t *testing.T, extraHCL string) {
}

// Load the services
if err := a.loadServices(a.Config); err != nil {
if err := a.loadServices(a.Config, nil); err != nil {
t.Fatalf("err: %s", err)
}

Expand Down Expand Up @@ -3437,6 +3437,54 @@ func TestAgent_ReloadConfigOutgoingRPCConfig(t *testing.T) {
require.Len(t, tlsConf.ClientCAs.Subjects(), 2)
}

func TestAgent_ReloadConfigAndKeepChecksStatus(t *testing.T) {
t.Parallel()
dataDir := testutil.TempDir(t, "agent") // we manage the data dir
defer os.RemoveAll(dataDir)
waitDurationSeconds := 1
hcl := `data_dir = "` + dataDir + `"
enable_local_script_checks=true
services=[{
name="webserver1",
check{name="check1",
args=["true"],
interval="` + strconv.Itoa(waitDurationSeconds) + `s"}}
]`
a := NewTestAgent(t, t.Name(), hcl)
defer a.Shutdown()

// Initially, state is critical during waitDurationSeconds seconds
retry.Run(t, func(r *retry.R) {
gotChecks := a.State.Checks(nil)
require.Equal(r, 1, len(gotChecks), "Should have a check registered, but had %#v", gotChecks)
for id, check := range gotChecks {
require.Equal(r, "critical", check.Status, "check %q is wrong", id)
}
})
c := TestConfig(testutil.Logger(t), config.Source{Name: t.Name(), Format: "hcl", Data: hcl})
a.ReloadConfig(c)
time.Sleep(time.Duration(waitDurationSeconds) * time.Second)
// It should now be passing
retry.Run(t, func(r *retry.R) {
gotChecks := a.State.Checks(nil)
for id, check := range gotChecks {
require.Equal(r, "passing", check.Status, "check %q is wrong", id)
}
})

require.NoError(t, a.ReloadConfig(c))
// After reload, should be passing directly (no critical state)
for id, check := range a.State.Checks(nil) {
require.Equal(t, "passing", check.Status, "check %q is wrong", id)
}
// Ensure to take reload into account event with async stuff
time.Sleep(time.Duration(100) * time.Millisecond)
// Of course, after a sleep, should be Ok too
for id, check := range a.State.Checks(nil) {
require.Equal(t, "passing", check.Status, "check %q is wrong", id)
}
}

func TestAgent_ReloadConfigIncomingRPCConfig(t *testing.T) {
t.Parallel()
dataDir := testutil.TempDir(t, "agent") // we manage the data dir
Expand Down
6 changes: 3 additions & 3 deletions agent/service_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func (s *ServiceManager) registerOnce(args *addServiceRequest) error {
s.agent.stateLock.Lock()
defer s.agent.stateLock.Unlock()

err := s.agent.addServiceInternal(args)
err := s.agent.addServiceInternal(args, s.agent.snapshotCheckState())
if err != nil {
return fmt.Errorf("error updating service registration: %v", err)
}
Expand Down Expand Up @@ -127,7 +127,7 @@ func (s *ServiceManager) AddService(req *addServiceRequest) error {
req.persistService = nil
req.persistDefaults = nil
req.persistServiceConfig = false
return s.agent.addServiceInternal(req)
return s.agent.addServiceInternal(req, s.agent.snapshotCheckState())
}

var (
Expand Down Expand Up @@ -279,7 +279,7 @@ func (w *serviceConfigWatch) RegisterAndStart(
token: w.registration.token,
replaceExistingChecks: w.registration.replaceExistingChecks,
source: w.registration.source,
})
}, w.agent.snapshotCheckState())
if err != nil {
return fmt.Errorf("error updating service registration: %v", err)
}
Expand Down