diff --git a/CHANGELOG.md b/CHANGELOG.md index 078db1238..1b302d092 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,15 @@ Thanks to [@murphymj25](https://github.com/murphymj25) for the patch. +* [PR #664](https://github.com/fabiolb/fabio/issues/664): Clean-up fabio service entries in Consul on dirty exit + + In the case fabio dies abruptly, the steps to deregister any fabio-related services in Consul will not take place. + In certain cases this could result in duplicate service and check entries after fabio restarts, especially if fabio runs in a Docker container. + + This PR addresses this issue by registering an additional TTL check that acts as a deadman switch and removes abandoned service registrations eventually. + + Thanks to [@pires](https://github.com/pires) for the [report](https://github.com/fabiolb/fabio/issues/663) and the fix. + * [PR #669](https://github.com/fabiolb/fabio/pull/669): Add option for downgrading tracing IDs to 64 bit When tracing is enabled, fabio injected 128 bit root span IDs if necessary. diff --git a/docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md b/docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md new file mode 100644 index 000000000..f8e7c1017 --- /dev/null +++ b/docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md @@ -0,0 +1,16 @@ +--- +title: "registry.consul.register.deregisterCriticalServiceAfter" +--- + +`registry.consul.register.deregisterCriticalServiceAfter` configures the time the +service health check is allowed to be in state `critical` until Consul automatically +deregisters it. +If fabio is still running, the service will be re-registered almost immediately after +being deleted by Consul. + +At the time of this writing, Consul enforces a minimum value of one minute and runs +its reaper process every thirty seconds. + +The default is + + registry.consul.register.deregisterCriticalServiceAfter = 90m \ No newline at end of file diff --git a/registry/consul/backend.go b/registry/consul/backend.go index 26dc8852b..cfff930ef 100644 --- a/registry/consul/backend.go +++ b/registry/consul/backend.go @@ -99,11 +99,10 @@ func (b *be) Deregister(service string) error { func (b *be) DeregisterAll() error { log.Printf("[DEBUG]: consul: Deregistering all registered aliases.") - for name, dereg := range b.dereg { + for _, dereg := range b.dereg { if dereg == nil { continue } - log.Printf("[INFO] consul: Deregistering %q", name) dereg <- true // trigger deregistration <-dereg // wait for completion } diff --git a/registry/consul/register.go b/registry/consul/register.go index e8aca4834..91251eee8 100644 --- a/registry/consul/register.go +++ b/registry/consul/register.go @@ -14,6 +14,12 @@ import ( "github.com/hashicorp/consul/api" ) +const ( + TTLInterval = time.Second * 15 + TTLRefreshInterval = time.Second * 10 + TTLDeregisterCriticalServiceAfter = time.Minute +) + // register keeps a service registered in consul. // // When a value is sent in the dereg channel the service is deregistered from @@ -46,22 +52,33 @@ func register(c *api.Client, service *api.AgentServiceRegistration) chan bool { log.Printf("[INFO] consul: Registered fabio with id %q", service.ID) log.Printf("[INFO] consul: Registered fabio with address %q", service.Address) log.Printf("[INFO] consul: Registered fabio with tags %q", strings.Join(service.Tags, ",")) - log.Printf("[INFO] consul: Registered fabio with health check to %q", service.Check.HTTP) + for _, check := range service.Checks { + log.Printf("[INFO] consul: Registered fabio with check %+v", check) + } return service.ID } deregister := func(serviceID string) { - log.Printf("[INFO] consul: Deregistering %s", service.Name) + log.Printf("[INFO] consul: Deregistering %q", service.Name) c.Agent().ServiceDeregister(serviceID) } + passTTL := func(serviceTTLID string) { + c.Agent().UpdateTTL(serviceTTLID, "", api.HealthPassing) + } + dereg := make(chan bool) go func() { var serviceID string + var serviceTTLCheckId string + for { if !registered(serviceID) { serviceID = register() + serviceTTLCheckId = computeServiceTTLCheckId(serviceID) + // Pass the TTL check right now so traffic can be served immediately. + passTTL(serviceTTLCheckId) } select { @@ -69,8 +86,9 @@ func register(c *api.Client, service *api.AgentServiceRegistration) chan bool { deregister(serviceID) dereg <- true return - case <-time.After(time.Second): - // continue + case <-time.After(TTLRefreshInterval): + // Reset the TTL check clock. + passTTL(serviceTTLCheckId) } } }() @@ -115,14 +133,39 @@ func serviceRegistration(cfg *config.Consul, serviceName string) (*api.AgentServ Address: ip.String(), Port: port, Tags: cfg.ServiceTags, - Check: &api.AgentServiceCheck{ - HTTP: checkURL, - Interval: cfg.CheckInterval.String(), - Timeout: cfg.CheckTimeout.String(), - TLSSkipVerify: cfg.CheckTLSSkipVerify, - DeregisterCriticalServiceAfter: cfg.CheckDeregisterCriticalServiceAfter, + // Set the checks for the service. + // + // Both checks must pass for Consul to consider the service healthy and therefore serve the fabio instance to clients. + Checks: []*api.AgentServiceCheck{ + // If fabio doesn't exit cleanly, it doesn't auto-deregister itself from Consul. + // In order to address this, we introduce a TTL check to prove the fabio instance is alive and able to route this service. + // The TTL check must be refreshed before its timeout is crossed. + // If the timeout is crossed, the check fails. + // If the check fails, Consul considers this service to have become unhealthy. + // If the check is failing (critical) after DeregisterCriticalServiceAfter is elapsed, the Consul reaper will remove it from Consul. + // For more info, read https://www.consul.io/api/agent/check.html#deregistercriticalserviceafter. + { + CheckID: computeServiceTTLCheckId(serviceID), + TTL: TTLInterval.String(), + DeregisterCriticalServiceAfter: TTLDeregisterCriticalServiceAfter.String(), + }, + // HTTP check is meant to confirm fabio health endpoint is reachable from the Consul agent. + // If the check fails, Consul considers this service to have become unhealthy. + // If the check fails and registry.consul.register.deregisterCriticalServiceAfter is set, the service will be deregistered from Consul. + // For more info, read https://www.consul.io/api/agent/check.html#deregistercriticalserviceafter. + { + HTTP: checkURL, + Interval: cfg.CheckInterval.String(), + Timeout: cfg.CheckTimeout.String(), + TLSSkipVerify: cfg.CheckTLSSkipVerify, + DeregisterCriticalServiceAfter: cfg.CheckDeregisterCriticalServiceAfter, + }, }, } return service, nil } + +func computeServiceTTLCheckId(serviceID string) string { + return strings.Join([]string{serviceID, "ttl"}, "-") +} diff --git a/registry/consul/service.go b/registry/consul/service.go index f9ad1067e..083d6d0ee 100644 --- a/registry/consul/service.go +++ b/registry/consul/service.go @@ -29,7 +29,7 @@ func NewServiceMonitor(client *api.Client, config *config.Consul, dc string) *Se } // Watch monitors the consul health checks and sends a new -// configuration to the updates channnel on every change. +// configuration to the updates channel on every change. func (w *ServiceMonitor) Watch(updates chan string) { var lastIndex uint64 for {