Skip to content

Commit

Permalink
agent: fix 'consul leave' shutdown race (#2880)
Browse files Browse the repository at this point in the history
When the agent is triggered to shutdown via an external 'consul leave'
command delivered via the HTTP API then the client expects to receive a
response when the agent is down. This creates a race on when to shutdown
the agent itself like the RPC server, the checks and the state and the
external endpoints like DNS and HTTP.

This patch splits the shutdown process into two parts:

 * shutdown the agent
 * shutdown the endpoints (http and dns)

They can be executed multiple times, concurrently and in any order but
should be executed first agent, then endpoints to provide consistent
behavior across all use cases. Both calls have to be executed for a
proper shutdown.

This could be partially hidden in a single function but would introduce
some magic that happens behind the scenes which one has to know of but
isn't obvious.

Fixes #2880
  • Loading branch information
magiconair committed Jun 21, 2017
1 parent 7abe308 commit ea5b0f2
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 32 deletions.
64 changes: 36 additions & 28 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1112,9 +1112,10 @@ func (a *Agent) Leave() error {
return a.delegate.Leave()
}

// Shutdown is used to hard stop the agent. Should be
// preceded by a call to Leave to do it gracefully.
func (a *Agent) Shutdown() error {
// ShutdownAgent is used to hard stop the agent. Should be preceded by
// Leave to do it gracefully. Should be followed by ShutdownEndpoints to
// terminate the HTTP and DNS servers as well.
func (a *Agent) ShutdownAgent() error {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()

Expand All @@ -1123,29 +1124,6 @@ func (a *Agent) Shutdown() error {
}
a.logger.Println("[INFO] agent: Requesting shutdown")

// Stop all API endpoints
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
for _, srv := range a.httpServers {
// http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2"
// the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is
// no longer sufficient to check for an HTTPS server.
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)

// graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")

// Stop all the checks
a.checkLock.Lock()
defer a.checkLock.Unlock()
Expand All @@ -1155,11 +1133,9 @@ func (a *Agent) Shutdown() error {
for _, chk := range a.checkTTLs {
chk.Stop()
}

for _, chk := range a.checkHTTPs {
chk.Stop()
}

for _, chk := range a.checkTCPs {
chk.Stop()
}
Expand All @@ -1185,6 +1161,38 @@ func (a *Agent) Shutdown() error {
return err
}

// ShutdownEndpoints terminates the HTTP and DNS servers. Should be
// preceeded by ShutdownAgent.
func (a *Agent) ShutdownEndpoints() {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()

if len(a.dnsServers) == 0 || len(a.httpServers) == 0 {
return
}

for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
a.dnsServers = nil

for _, srv := range a.httpServers {
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.httpServers = nil

a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
}

// ReloadCh is used to return a channel that can be
// used for triggering reloads and returning a response.
func (a *Agent) ReloadCh() chan chan error {
Expand Down
2 changes: 1 addition & 1 deletion agent/agent_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ func (s *HTTPServer) AgentLeave(resp http.ResponseWriter, req *http.Request) (in
if err := s.agent.Leave(); err != nil {
return nil, err
}
return nil, s.agent.Shutdown()
return nil, s.agent.ShutdownAgent()
}

func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
Expand Down
8 changes: 6 additions & 2 deletions agent/testagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ func (a *TestAgent) Start() *TestAgent {
fmt.Println(id, a.Name, "Error starting agent:", err)
runtime.Goexit()
} else {
agent.Shutdown()
agent.ShutdownAgent()
agent.ShutdownEndpoints()
wait := time.Duration(rand.Int31n(2000)) * time.Millisecond
fmt.Println(id, a.Name, "retrying in", wait)
time.Sleep(wait)
Expand Down Expand Up @@ -221,7 +222,10 @@ func (a *TestAgent) Shutdown() error {
os.RemoveAll(a.DataDir)
}
}()
return a.Agent.Shutdown()

// shutdown agent before endpoints
defer a.Agent.ShutdownEndpoints()
return a.Agent.ShutdownAgent()
}

func (a *TestAgent) HTTPAddr() string {
Expand Down
5 changes: 4 additions & 1 deletion command/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -691,7 +691,10 @@ func (cmd *AgentCommand) run(args []string) int {
cmd.UI.Error(fmt.Sprintf("Error starting agent: %s", err))
return 1
}
defer agent.Shutdown()

// shutdown agent before endpoints
defer agent.ShutdownEndpoints()
defer agent.ShutdownAgent()

if !config.DisableUpdateCheck {
cmd.startupUpdateCheck(config)
Expand Down

0 comments on commit ea5b0f2

Please sign in to comment.