Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds a PUT-based API for TTL checks and retains output on timeouts. #1785

Merged
merged 3 commits into from
Mar 3, 2016
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions command/agent/agent_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,58 @@ func (s *HTTPServer) AgentCheckFail(resp http.ResponseWriter, req *http.Request)
return nil, nil
}

// checkUpdate is the payload for a PUT to AgentCheckUpdate.
type checkUpdate struct {
// Status is one of the structs.Health* states, "passing", "warning", or
// "critical".
Status string

// Output is the information to post to the UI for operators as the
// output of the process that decided to hit the TTL check. This is
// different from the note field that's associated with the check
// itself.
Output string
}

// AgentCheckUpdate handles PUT requests that set both the status and the
// output of a check in a single call, as an alternative to the GET-based
// Pass/Warn/Fail APIs. The check ID is whatever follows
// "/v1/agent/check/update/" in the request path. Any verb other than PUT is
// answered with a 405; a body that fails to decode, or that carries a status
// other than passing/warning/critical, is answered with a 400.
func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
	if req.Method != "PUT" {
		resp.WriteHeader(http.StatusMethodNotAllowed)
		return nil, nil
	}

	var body checkUpdate
	if err := decodeBody(req, &body, nil); err != nil {
		resp.WriteHeader(http.StatusBadRequest)
		fmt.Fprintf(resp, "Request decode failed: %v", err)
		return nil, nil
	}

	// Only the three known health states are accepted.
	switch body.Status {
	case structs.HealthPassing, structs.HealthWarning, structs.HealthCritical:
	default:
		resp.WriteHeader(http.StatusBadRequest)
		fmt.Fprintf(resp, "Invalid check status: '%s'", body.Status)
		return nil, nil
	}

	// Cap oversized output at CheckBufSize bytes and append a note saying
	// how much of the original was kept.
	output := body.Output
	if size := len(output); size > CheckBufSize {
		output = fmt.Sprintf("%s\n...\nCaptured %d of %d bytes",
			output[:CheckBufSize], CheckBufSize, size)
	}

	id := strings.TrimPrefix(req.URL.Path, "/v1/agent/check/update/")
	if err := s.agent.UpdateCheck(id, body.Status, output); err != nil {
		return nil, err
	}
	s.syncChanges()
	return nil, nil
}

func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args ServiceDefinition
// Fixup the type decode of TTL or Interval if a check if provided
Expand Down
132 changes: 129 additions & 3 deletions command/agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http/httptest"
"os"
"reflect"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -428,7 +429,6 @@ func TestHTTPAgentPassCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/pass/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
Expand Down Expand Up @@ -461,7 +461,6 @@ func TestHTTPAgentWarnCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/warn/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
Expand Down Expand Up @@ -494,7 +493,6 @@ func TestHTTPAgentFailCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/fail/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
Expand All @@ -515,6 +513,134 @@ func TestHTTPAgentFailCheck(t *testing.T) {
}
}

// TestHTTPAgentUpdateCheck exercises the PUT-based check update endpoint:
// each valid status with output, oversized output, an invalid status, and a
// non-PUT verb.
func TestHTTPAgentUpdateCheck(t *testing.T) {
	dir, srv := makeHTTPServer(t)
	defer os.RemoveAll(dir)
	defer srv.Shutdown()
	defer srv.agent.Shutdown()

	check := &structs.HealthCheck{Name: "test", CheckID: "test"}
	ttl := &CheckType{TTL: 15 * time.Second}
	if err := srv.agent.AddCheck(check, ttl, false, ""); err != nil {
		t.Fatalf("err: %v", err)
	}

	// send issues one request against the update endpoint for check "test"
	// and returns the recorder after verifying that the handler produced
	// neither an error nor a response object.
	send := func(method string, payload checkUpdate) *httptest.ResponseRecorder {
		req, err := http.NewRequest(method, "/v1/agent/check/update/test", nil)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		req.Body = encodeReq(payload)

		rec := httptest.NewRecorder()
		obj, err := srv.AgentCheckUpdate(rec, req)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		if obj != nil {
			t.Fatalf("bad: %v", obj)
		}
		return rec
	}

	// Every valid status should be stored together with its output.
	valid := []checkUpdate{
		{Status: "passing", Output: "hello-passing"},
		{Status: "critical", Output: "hello-critical"},
		{Status: "warning", Output: "hello-warning"},
	}
	for _, want := range valid {
		if rec := send("PUT", want); rec.Code != 200 {
			t.Fatalf("expected 200, got %d", rec.Code)
		}

		got := srv.agent.state.Checks()["test"]
		if got.Status != want.Status || got.Output != want.Output {
			t.Fatalf("bad: %v", got)
		}
	}

	// Make sure abusive levels of output are capped.
	huge := checkUpdate{
		Status: "passing",
		Output: strings.Repeat("-= bad -=", 5*CheckBufSize),
	}
	if rec := send("PUT", huge); rec.Code != 200 {
		t.Fatalf("expected 200, got %d", rec.Code)
	}

	// Since a note about truncation is appended, only do a rough check that
	// the output buffer was cut down so this test isn't super brittle.
	capped := srv.agent.state.Checks()["test"]
	if capped.Status != structs.HealthPassing || len(capped.Output) > 2*CheckBufSize {
		t.Fatalf("bad: %v", capped)
	}

	// Check a bogus status.
	if rec := send("PUT", checkUpdate{Status: "itscomplicated"}); rec.Code != 400 {
		t.Fatalf("expected 400, got %d", rec.Code)
	}

	// Check a bogus verb.
	if rec := send("POST", checkUpdate{Status: "passing"}); rec.Code != 405 {
		t.Fatalf("expected 405, got %d", rec.Code)
	}
}

func TestHTTPAgentRegisterService(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
Expand Down
24 changes: 23 additions & 1 deletion command/agent/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,9 @@ type CheckTTL struct {

timer *time.Timer

lastOutput string
lastOutputLock sync.RWMutex

stop bool
stopCh chan struct{}
stopLock sync.Mutex
Expand Down Expand Up @@ -265,20 +268,39 @@ func (c *CheckTTL) run() {
case <-c.timer.C:
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
c.CheckID)
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, "TTL expired")
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, c.getExpiredOutput())

case <-c.stopCh:
return
}
}
}

// getExpiredOutput formats the output for the case when the TTL is expired.
// It reads lastOutput while holding the read lock and returns "TTL expired"
// followed either by a note that no output was available before the timeout,
// or by the last stored output after a blank line.
func (c *CheckTTL) getExpiredOutput() string {
c.lastOutputLock.RLock()
defer c.lastOutputLock.RUnlock()

const prefix = "TTL expired"
if c.lastOutput == "" {
return fmt.Sprintf("%s (no output was available before timeout)", prefix)
}

return fmt.Sprintf("%s (last output before timeout follows)\n\n%s", prefix, c.lastOutput)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we can try to conform to one newline == one log entry, that would be ideal.

}

// SetStatus records a new status and output for the check and renews the
// TTL. The update is forwarded to the notifier, the output is remembered so
// it can be included if the TTL later expires, and the timer is reset to the
// full TTL.
func (c *CheckTTL) SetStatus(status, output string) {
	c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v", c.CheckID, status)
	c.Notify.UpdateCheck(c.CheckID, status, output)

	// Remember the most recent output so it can be retained on expiry.
	func() {
		c.lastOutputLock.Lock()
		defer c.lastOutputLock.Unlock()
		c.lastOutput = output
	}()

	c.timer.Reset(c.TTL)
}

Expand Down
7 changes: 6 additions & 1 deletion command/agent/check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"net/http/httptest"
"os"
"os/exec"
"strings"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -150,7 +151,7 @@ func TestCheckTTL(t *testing.T) {
defer check.Stop()

time.Sleep(50 * time.Millisecond)
check.SetStatus(structs.HealthPassing, "")
check.SetStatus(structs.HealthPassing, "test-output")

if mock.updates["foo"] != 1 {
t.Fatalf("should have 1 updates %v", mock.updates)
Expand All @@ -176,6 +177,10 @@ func TestCheckTTL(t *testing.T) {
if mock.state["foo"] != structs.HealthCritical {
t.Fatalf("should be critical %v", mock.state)
}

if !strings.Contains(mock.output["foo"], "test-output") {
t.Fatalf("should have retained output %v", mock.output)
}
}

func mockHTTPServer(responseCode int) *httptest.Server {
Expand Down
1 change: 1 addition & 0 deletions command/agent/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/agent/check/pass/", s.wrap(s.AgentCheckPass))
s.mux.HandleFunc("/v1/agent/check/warn/", s.wrap(s.AgentCheckWarn))
s.mux.HandleFunc("/v1/agent/check/fail/", s.wrap(s.AgentCheckFail))
s.mux.HandleFunc("/v1/agent/check/update/", s.wrap(s.AgentCheckUpdate))

s.mux.HandleFunc("/v1/agent/service/register", s.wrap(s.AgentRegisterService))
s.mux.HandleFunc("/v1/agent/service/deregister/", s.wrap(s.AgentDeregisterService))
Expand Down
46 changes: 37 additions & 9 deletions website/source/docs/agent/http/agent.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ The following endpoints are supported:
* [`/v1/agent/force-leave/<node>`](#agent_force_leave) : Forces removal of a node
* [`/v1/agent/check/register`](#agent_check_register) : Registers a new local check
* [`/v1/agent/check/deregister/<checkID>`](#agent_check_deregister) : Deregisters a local check
* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local test as passing
* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local test as warning
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local test as critical
* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local check as passing
* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local check as warning
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local check as critical
* [`/v1/agent/check/update/<checkID>`](#agent_check_update) : Updates a local check
* [`/v1/agent/service/register`](#agent_service_register) : Registers a new local service
* [`/v1/agent/service/deregister/<serviceID>`](#agent_service_deregister) : Deregisters a local service
* [`/v1/agent/service/maintenance/<serviceID>`](#agent_service_maintenance) : Manages service maintenance mode
Expand Down Expand Up @@ -310,8 +311,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `passing`
and the TTL clock is reset.

The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.

The return code is 200 on success.

Expand All @@ -321,8 +323,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `warning`,
and the TTL clock is reset.

The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.

The return code is 200 on success.

Expand All @@ -332,8 +335,33 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `critical`,
and the TTL clock is reset.

The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.

The return code is 200 on success.

### <a name="agent_check_update"></a> /v1/agent/check/update/\<checkId\>

This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.html).
When this endpoint is accessed with a PUT, the status and output of the check are
updated and the TTL clock is reset.

This endpoint expects a JSON request body sent via PUT. The request body must look like:

```javascript
{
"Status": "passing",
"Output": "curl reported a failure:\n\n..."
}
```

The `Status` field is mandatory, and must be set to "passing", "warning", or "critical".

`Output` is an optional field that will associate a human-readable message with the status
of the check, such as the output of the checking script or process. This will be truncated
if it exceeds 4KB in size. This will be passed through to the check's `Output` field in the
check endpoints.

The return code is 200 on success.

Expand Down