From a588e733ecabb4f85ff0e906ed1278263f8d877f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Seux?=
Date: Sat, 7 Oct 2017 10:17:08 +0200
Subject: [PATCH] Implement /v1/agent/health/service/ endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This endpoint aggregates all checks related to a service ID on the agent
and returns an appropriate HTTP code + the string describing the worst
check.

This allows service status to be cleanly exposed to other components,
hiding the complexity of multiple checks.
This is especially useful when consul is used to feed a load balancer
which delegates health checking to the consul agent.

Exposing this endpoint on the agent is necessary to avoid a hit on consul
servers and avoid decreasing resiliency (this endpoint will work even if
there is no consul leader in the cluster).

Fix #2488, relates to #802

Change-Id: Ib340c62bbbba46fd4256ed31474d8ffb1762d4df
Signed-off-by: Grégoire Seux
---
 agent/agent_endpoint.go      |  63 ++++++++++++++
 agent/agent_endpoint_test.go | 157 +++++++++++++++++++++++++++++++++++
 agent/http.go                |   1 +
 3 files changed, 221 insertions(+)

diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go
index 7c2276452b8e..dae2374d3906 100644
--- a/agent/agent_endpoint.go
+++ b/agent/agent_endpoint.go
@@ -450,6 +450,69 @@ func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Reques
 	return nil, nil
 }
 
+// AgentHealthService reports the aggregated health of one service instance
+// registered on this agent. The service ID is the part of the request path
+// that follows /v1/agent/health/service/. The response body is the aggregated
+// status of every check in the agent state attached to that service ID, and
+// the HTTP status code mirrors it: 200 for passing, 429 for warning and 503
+// for anything else. An empty service ID, or one that no check refers to,
+// yields a 400.
+func (s *HTTPServer) AgentHealthService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
+	if req.Method != http.MethodGet {
+		return nil, MethodNotAllowedError{req.Method, []string{http.MethodGet}}
+	}
+
+	// The lookup is keyed on the service ID rather than the service name,
+	// since several instances of the same service may run on this host.
+	serviceID := strings.TrimPrefix(req.URL.Path, "/v1/agent/health/service/")
+	if serviceID == "" {
+		resp.WriteHeader(http.StatusBadRequest)
+		fmt.Fprint(resp, "Missing service id")
+		return nil, nil
+	}
+
+	// TODO: should we filter using acls like in AgentChecks method?
+	var serviceChecks api.HealthChecks
+	for _, c := range s.agent.state.Checks() {
+		if c.ServiceID != serviceID {
+			continue
+		}
+		// TODO: harmonize struct.HealthCheck and api.HealthCheck (or at least extract conversion function)
+		serviceChecks = append(serviceChecks, &api.HealthCheck{
+			Node:        c.Node,
+			CheckID:     string(c.CheckID),
+			Name:        c.Name,
+			Status:      c.Status,
+			Notes:       c.Notes,
+			Output:      c.Output,
+			ServiceID:   c.ServiceID,
+			ServiceName: c.ServiceName,
+			ServiceTags: c.ServiceTags,
+		})
+	}
+
+	// NOTE(review): a registered service without any check is reported the
+	// same way as an unknown service ID; confirm that 400 is intended there.
+	if len(serviceChecks) == 0 {
+		resp.WriteHeader(http.StatusBadRequest)
+		fmt.Fprintf(resp, "Invalid serviceID %s", serviceID)
+		return nil, nil
+	}
+
+	// The aggregated status is written as the body in every case; only the
+	// status code differs.
+	status := serviceChecks.AggregatedStatus()
+	switch status {
+	case api.HealthWarning:
+		resp.WriteHeader(http.StatusTooManyRequests)
+	case api.HealthPassing:
+		resp.WriteHeader(http.StatusOK)
+	default:
+		resp.WriteHeader(http.StatusServiceUnavailable)
+	}
+	fmt.Fprint(resp, status)
+	return nil, nil
+}
+
 func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
 	var args structs.ServiceDefinition
 	// Fixup the type decode of TTL or Interval if a check if provided.
diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go
index ff77b5597535..4ae37f3d5363 100644
--- a/agent/agent_endpoint_test.go
+++ b/agent/agent_endpoint_test.go
@@ -131,6 +131,113 @@ func TestAgent_Checks(t *testing.T) {
 	}
 }
 
+// TestAgent_Health_Service registers three services with two checks each and
+// verifies that /v1/agent/health/service/<id> answers with the aggregated
+// status of those checks, both as the HTTP status code and as the body.
+func TestAgent_Health_Service(t *testing.T) {
+	t.Parallel()
+	a := NewTestAgent(t.Name(), "")
+	defer a.Shutdown()
+
+	services := []*structs.NodeService{
+		{ID: "mysql", Service: "mysql"},
+		{ID: "mysql2", Service: "mysql2"},
+		{ID: "mysql3", Service: "mysql3"},
+	}
+	for _, service := range services {
+		if err := a.AddService(service, nil, false, ""); err != nil {
+			t.Fatalf("err: %v", err)
+		}
+	}
+
+	// Every check carries its own CheckID. Two checks registered under the
+	// same ID would presumably replace one another in the agent state
+	// (verify against AddCheck), leaving a single check per service and
+	// the aggregation untested.
+	checks := []*structs.HealthCheck{
+		// mysql: passing + passing is expected to aggregate to passing.
+		{
+			Node:      a.Config.NodeName,
+			CheckID:   "mysql_1",
+			Name:      "mysql",
+			ServiceID: "mysql",
+			Status:    api.HealthPassing,
+		},
+		{
+			Node:      a.Config.NodeName,
+			CheckID:   "mysql_2",
+			Name:      "mysql",
+			ServiceID: "mysql",
+			Status:    api.HealthPassing,
+		},
+		// mysql2: passing + warning is expected to aggregate to warning.
+		{
+			Node:      a.Config.NodeName,
+			CheckID:   "mysql2_1",
+			Name:      "mysql2",
+			ServiceID: "mysql2",
+			Status:    api.HealthPassing,
+		},
+		{
+			Node:      a.Config.NodeName,
+			CheckID:   "mysql2_2",
+			Name:      "mysql2",
+			ServiceID: "mysql2",
+			Status:    api.HealthWarning,
+		},
+		// mysql3: warning + critical is expected to aggregate to critical.
+		{
+			Node:      a.Config.NodeName,
+			CheckID:   "mysql3_1",
+			Name:      "mysql3",
+			ServiceID: "mysql3",
+			Status:    api.HealthWarning,
+		},
+		{
+			Node:      a.Config.NodeName,
+			CheckID:   "mysql3_2",
+			Name:      "mysql3",
+			ServiceID: "mysql3",
+			Status:    api.HealthCritical,
+		},
+	}
+	for _, chk := range checks {
+		if err := a.state.AddCheck(chk, ""); err != nil {
+			t.Fatalf("Err: %v", err)
+		}
+	}
+
+	cases := []struct {
+		name      string
+		serviceID string
+		code      int
+		body      string
+	}{
+		{"passing checks", "mysql", http.StatusOK, "passing"},
+		{"warning checks", "mysql2", http.StatusTooManyRequests, "warning"},
+		{"critical checks", "mysql3", http.StatusServiceUnavailable, "critical"},
+		{"unknown serviceid", "mysql1", http.StatusBadRequest, "Invalid serviceID mysql1"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			req, err := http.NewRequest("GET", "/v1/agent/health/service/"+tc.serviceID, nil)
+			if err != nil {
+				t.Fatalf("Err: %v", err)
+			}
+			resp := httptest.NewRecorder()
+			if _, err := a.srv.AgentHealthService(resp, req); err != nil {
+				t.Fatalf("Err: %v", err)
+			}
+			if got, want := resp.Code, tc.code; got != want {
+				t.Fatalf("got status %d want %d. Body: %q", got, want, resp.Body.String())
+			}
+			if got, want := resp.Body.String(), tc.body; got != want {
+				t.Fatalf("got body %q want %q", got, want)
+			}
+		})
+	}
+}
+
 func TestAgent_Checks_ACLFilter(t *testing.T) {
 	t.Parallel()
 	a := NewTestAgent(t.Name(), TestACLConfig())
diff --git a/agent/http.go b/agent/http.go
index 10527ea86d09..b589ced470c3 100644
--- a/agent/http.go
+++ b/agent/http.go
@@ -106,6 +106,7 @@ func (s *HTTPServer) handler(enableDebug bool) http.Handler {
 	handleFuncMetrics("/v1/agent/join/", s.wrap(s.AgentJoin))
 	handleFuncMetrics("/v1/agent/leave", s.wrap(s.AgentLeave))
 	handleFuncMetrics("/v1/agent/force-leave/", s.wrap(s.AgentForceLeave))
+	handleFuncMetrics("/v1/agent/health/service/", s.wrap(s.AgentHealthService))
 	handleFuncMetrics("/v1/agent/check/register", s.wrap(s.AgentRegisterCheck))
 	handleFuncMetrics("/v1/agent/check/deregister/", s.wrap(s.AgentDeregisterCheck))
 	handleFuncMetrics("/v1/agent/check/pass/", s.wrap(s.AgentCheckPass))