Skip to content

Commit

Permalink
Merge pull request #1785 from hashicorp/f-check-put-api
Browse files Browse the repository at this point in the history
Adds a PUT-based API for TTL checks and retains output on timeouts.
  • Loading branch information
slackpad committed Mar 3, 2016
2 parents ef1a588 + f46fa33 commit ad13b34
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 14 deletions.
52 changes: 52 additions & 0 deletions command/agent/agent_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,58 @@ func (s *HTTPServer) AgentCheckFail(resp http.ResponseWriter, req *http.Request)
return nil, nil
}

// checkUpdate is the payload for a PUT to AgentCheckUpdate. It is decoded
// from the JSON request body.
//
// NOTE: the tests in this package build this struct with unkeyed literals,
// so the order of the fields (Status, then Output) should not be changed.
type checkUpdate struct {
// Status is one of the structs.Health* states, "passing", "warning", or
// "critical". AgentCheckUpdate rejects any other value with a 400.
Status string

// Output is the information to post to the UI for operators as the
// output of the process that decided to hit the TTL check. This is
// different from the note field that's associated with the check
// itself. AgentCheckUpdate truncates it when it is longer than
// CheckBufSize bytes.
Output string
}

// AgentCheckUpdate is a PUT-based alternative to the GET-based Pass/Warn/Fail
// APIs. The check ID is whatever follows "/v1/agent/check/update/" in the
// request path, and the request body is decoded into a checkUpdate.
//
// Problems with the request are reported by writing the status code (405 for
// a non-PUT method, 400 for an undecodable body or an unrecognized status)
// directly to resp and returning nil, nil. A non-nil error is only returned
// when the agent's UpdateCheck call fails.
func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
	// Only PUT is accepted on this endpoint.
	if req.Method != "PUT" {
		resp.WriteHeader(405)
		return nil, nil
	}

	var payload checkUpdate
	if err := decodeBody(req, &payload, nil); err != nil {
		resp.WriteHeader(400)
		fmt.Fprintf(resp, "Request decode failed: %v", err)
		return nil, nil
	}

	// The status has to be one of the three health states listed here;
	// anything else is a client error.
	switch payload.Status {
	case structs.HealthPassing, structs.HealthWarning, structs.HealthCritical:
	default:
		resp.WriteHeader(400)
		fmt.Fprintf(resp, "Invalid check status: '%s'", payload.Status)
		return nil, nil
	}

	// Cap the stored output at CheckBufSize bytes, leaving a note behind
	// that says how much of the original was kept.
	if size := len(payload.Output); size > CheckBufSize {
		kept := payload.Output[:CheckBufSize]
		payload.Output = fmt.Sprintf("%s ... (captured %d of %d bytes)",
			kept, CheckBufSize, size)
	}

	id := strings.TrimPrefix(req.URL.Path, "/v1/agent/check/update/")
	err := s.agent.UpdateCheck(id, payload.Status, payload.Output)
	if err != nil {
		return nil, err
	}
	s.syncChanges()
	return nil, nil
}

func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args ServiceDefinition
// Fixup the type decode of TTL or Interval if a check if provided
Expand Down
132 changes: 129 additions & 3 deletions command/agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http/httptest"
"os"
"reflect"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -428,7 +429,6 @@ func TestHTTPAgentPassCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/pass/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
Expand Down Expand Up @@ -461,7 +461,6 @@ func TestHTTPAgentWarnCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/warn/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
Expand Down Expand Up @@ -494,7 +493,6 @@ func TestHTTPAgentFailCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/fail/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
Expand All @@ -515,6 +513,134 @@ func TestHTTPAgentFailCheck(t *testing.T) {
}
}

// TestHTTPAgentUpdateCheck exercises the PUT-based check update endpoint
// against a registered TTL check: each valid status is accepted and stored
// along with its output, oversized output is cut down, an unknown status is
// answered with a 400, and a non-PUT verb is answered with a 405.
func TestHTTPAgentUpdateCheck(t *testing.T) {
	dir, srv := makeHTTPServer(t)
	defer os.RemoveAll(dir)
	defer srv.Shutdown()
	defer srv.agent.Shutdown()

	ttlCheck := &structs.HealthCheck{Name: "test", CheckID: "test"}
	ttlType := &CheckType{TTL: 15 * time.Second}
	if err := srv.agent.AddCheck(ttlCheck, ttlType, false, ""); err != nil {
		t.Fatalf("err: %v", err)
	}

	// send issues a request with the given verb and payload against the
	// update endpoint for the "test" check. It fails the test if the handler
	// returns an error or a non-nil object, and hands back the recorder so
	// the caller can inspect the status code.
	send := func(method string, payload checkUpdate) *httptest.ResponseRecorder {
		req, err := http.NewRequest(method, "/v1/agent/check/update/test", nil)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		req.Body = encodeReq(payload)

		rec := httptest.NewRecorder()
		obj, err := srv.AgentCheckUpdate(rec, req)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		if obj != nil {
			t.Fatalf("bad: %v", obj)
		}
		return rec
	}

	// Every valid status should be accepted and reflected in local state.
	for _, want := range []checkUpdate{
		{"passing", "hello-passing"},
		{"critical", "hello-critical"},
		{"warning", "hello-warning"},
	} {
		if rec := send("PUT", want); rec.Code != 200 {
			t.Fatalf("expected 200, got %d", rec.Code)
		}

		got := srv.agent.state.Checks()["test"]
		if got.Status != want.Status || got.Output != want.Output {
			t.Fatalf("bad: %v", got)
		}
	}

	// Make sure abusive levels of output are capped.
	{
		huge := checkUpdate{
			Status: "passing",
			Output: strings.Repeat("-= bad -=", 5*CheckBufSize),
		}
		if rec := send("PUT", huge); rec.Code != 200 {
			t.Fatalf("expected 200, got %d", rec.Code)
		}

		// Since we append some notes about truncating, we just do a
		// rough check that the output buffer was cut down so this test
		// isn't super brittle.
		got := srv.agent.state.Checks()["test"]
		if got.Status != structs.HealthPassing || len(got.Output) > 2*CheckBufSize {
			t.Fatalf("bad: %v", got)
		}
	}

	// Check a bogus status.
	if rec := send("PUT", checkUpdate{Status: "itscomplicated"}); rec.Code != 400 {
		t.Fatalf("expected 400, got %d", rec.Code)
	}

	// Check a bogus verb.
	if rec := send("POST", checkUpdate{Status: "passing"}); rec.Code != 405 {
		t.Fatalf("expected 405, got %d", rec.Code)
	}
}

func TestHTTPAgentRegisterService(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
Expand Down
24 changes: 23 additions & 1 deletion command/agent/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,9 @@ type CheckTTL struct {

timer *time.Timer

lastOutput string
lastOutputLock sync.RWMutex

stop bool
stopCh chan struct{}
stopLock sync.Mutex
Expand Down Expand Up @@ -265,20 +268,39 @@ func (c *CheckTTL) run() {
case <-c.timer.C:
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
c.CheckID)
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, "TTL expired")
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, c.getExpiredOutput())

case <-c.stopCh:
return
}
}
}

// getExpiredOutput builds the output string reported when the TTL expires.
// With no stored output it is just "TTL expired"; otherwise the last output
// passed to SetStatus is appended after an explanatory note.
func (c *CheckTTL) getExpiredOutput() string {
	const prefix = "TTL expired"

	// Copy the stored output under the read lock, then format outside it.
	c.lastOutputLock.RLock()
	last := c.lastOutput
	c.lastOutputLock.RUnlock()

	if last != "" {
		return fmt.Sprintf("%s (last output before timeout follows): %s", prefix, last)
	}
	return prefix
}

// SetStatus reports a new status and output for the check and renews the
// TTL by resetting the timer; if the TTL had already expired, this restarts
// it. The output is also remembered so it can be retained if the TTL expires
// later.
func (c *CheckTTL) SetStatus(status, output string) {
	c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v", c.CheckID, status)
	c.Notify.UpdateCheck(c.CheckID, status, output)

	// Remember the most recent output under the write lock.
	func() {
		c.lastOutputLock.Lock()
		defer c.lastOutputLock.Unlock()
		c.lastOutput = output
	}()

	c.timer.Reset(c.TTL)
}

Expand Down
7 changes: 6 additions & 1 deletion command/agent/check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"net/http/httptest"
"os"
"os/exec"
"strings"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -150,7 +151,7 @@ func TestCheckTTL(t *testing.T) {
defer check.Stop()

time.Sleep(50 * time.Millisecond)
check.SetStatus(structs.HealthPassing, "")
check.SetStatus(structs.HealthPassing, "test-output")

if mock.updates["foo"] != 1 {
t.Fatalf("should have 1 updates %v", mock.updates)
Expand All @@ -176,6 +177,10 @@ func TestCheckTTL(t *testing.T) {
if mock.state["foo"] != structs.HealthCritical {
t.Fatalf("should be critical %v", mock.state)
}

if !strings.Contains(mock.output["foo"], "test-output") {
t.Fatalf("should have retained output %v", mock.output)
}
}

func mockHTTPServer(responseCode int) *httptest.Server {
Expand Down
1 change: 1 addition & 0 deletions command/agent/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/agent/check/pass/", s.wrap(s.AgentCheckPass))
s.mux.HandleFunc("/v1/agent/check/warn/", s.wrap(s.AgentCheckWarn))
s.mux.HandleFunc("/v1/agent/check/fail/", s.wrap(s.AgentCheckFail))
s.mux.HandleFunc("/v1/agent/check/update/", s.wrap(s.AgentCheckUpdate))

s.mux.HandleFunc("/v1/agent/service/register", s.wrap(s.AgentRegisterService))
s.mux.HandleFunc("/v1/agent/service/deregister/", s.wrap(s.AgentDeregisterService))
Expand Down
46 changes: 37 additions & 9 deletions website/source/docs/agent/http/agent.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ The following endpoints are supported:
* [`/v1/agent/force-leave/<node>`](#agent_force_leave) : Forces removal of a node
* [`/v1/agent/check/register`](#agent_check_register) : Registers a new local check
* [`/v1/agent/check/deregister/<checkID>`](#agent_check_deregister) : Deregisters a local check
* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local test as passing
* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local test as warning
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local test as critical
* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local check as passing
* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local check as warning
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local check as critical
* [`/v1/agent/check/update/<checkID>`](#agent_check_update) : Updates a local check
* [`/v1/agent/service/register`](#agent_service_register) : Registers a new local service
* [`/v1/agent/service/deregister/<serviceID>`](#agent_service_deregister) : Deregisters a local service
* [`/v1/agent/service/maintenance/<serviceID>`](#agent_service_maintenance) : Manages service maintenance mode
Expand Down Expand Up @@ -310,8 +311,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `passing`
and the TTL clock is reset.

The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.

The return code is 200 on success.

Expand All @@ -321,8 +323,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `warning`,
and the TTL clock is reset.

The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.

The return code is 200 on success.

Expand All @@ -332,8 +335,33 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `critical`,
and the TTL clock is reset.

The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.

The return code is 200 on success.

### <a name="agent_check_update"></a> /v1/agent/check/update/\<checkId\>

This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.html).
When this endpoint is accessed with a PUT, the status and output of the check are
updated and the TTL clock is reset.

This endpoint expects a JSON request body sent with a PUT. The request body must look like:

```javascript
{
"Status": "passing",
"Output": "curl reported a failure:\n\n..."
}
```

The `Status` field is mandatory, and must be set to "passing", "warning", or "critical".

`Output` is an optional field that will associate a human-readable message with the status
of the check, such as the output of the checking script or process. This will be truncated
if it exceeds 4KB in size. This will be passed through to the check's `Output` field in the
check endpoints.

The return code is 200 on success.

Expand Down

0 comments on commit ad13b34

Please sign in to comment.