Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Socket check type #1130

Merged
merged 2 commits into from
Jul 27, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ type AgentServiceCheck struct {
Timeout string `json:",omitempty"`
TTL string `json:",omitempty"`
HTTP string `json:",omitempty"`
TCP string `json:",omitempty"`
Status string `json:",omitempty"`
}
// AgentServiceChecks is a slice of pointers to AgentServiceCheck.
type AgentServiceChecks []*AgentServiceCheck
Expand Down
33 changes: 33 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ type Agent struct {
// checkHTTPs maps the check ID to an associated HTTP check
checkHTTPs map[string]*CheckHTTP

// checkTCPs maps the check ID to an associated TCP check
checkTCPs map[string]*CheckTCP

// checkTTLs maps the check ID to an associated check TTL
checkTTLs map[string]*CheckTTL

Expand Down Expand Up @@ -145,6 +148,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
checkMonitors: make(map[string]*CheckMonitor),
checkTTLs: make(map[string]*CheckTTL),
checkHTTPs: make(map[string]*CheckHTTP),
checkTCPs: make(map[string]*CheckTCP),
eventCh: make(chan serf.UserEvent, 1024),
eventBuf: make([]*UserEvent, 256),
shutdownCh: make(chan struct{}),
Expand Down Expand Up @@ -440,6 +444,10 @@ func (a *Agent) Shutdown() error {
chk.Stop()
}

for _, chk := range a.checkTCPs {
chk.Stop()
}

a.logger.Println("[INFO] agent: requesting shutdown")
var err error
if a.server != nil {
Expand Down Expand Up @@ -801,6 +809,27 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
http.Start()
a.checkHTTPs[check.CheckID] = http

} else if chkType.IsTCP() {
if existing, ok := a.checkTCPs[check.CheckID]; ok {
existing.Stop()
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}

tcp := &CheckTCP{
Notify: &a.state,
CheckID: check.CheckID,
TCP: chkType.TCP,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
}
tcp.Start()
a.checkTCPs[check.CheckID] = tcp

} else {
if existing, ok := a.checkMonitors[check.CheckID]; ok {
existing.Stop()
Expand Down Expand Up @@ -857,6 +886,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
check.Stop()
delete(a.checkHTTPs, checkID)
}
if check, ok := a.checkTCPs[checkID]; ok {
check.Stop()
delete(a.checkTCPs, checkID)
}
if check, ok := a.checkTTLs[checkID]; ok {
check.Stop()
delete(a.checkTTLs, checkID)
Expand Down
98 changes: 94 additions & 4 deletions command/agent/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"io/ioutil"
"log"
"net"
"net/http"
"os/exec"
"sync"
Expand Down Expand Up @@ -31,13 +32,14 @@ const (

// CheckType is used to create either the CheckMonitor
// or the CheckTTL.
// Three types are supported: Script, HTTP, and TTL
// Script and HTTP both require Interval
// Four types are supported: Script, HTTP, TCP and TTL
// Script, HTTP and TCP all require Interval
// Only one of the types needs to be provided
// TTL or Script/Interval or HTTP/Interval
// TTL or Script/Interval or HTTP/Interval or TCP/Interval
type CheckType struct {
Script string
HTTP string
TCP string
Interval time.Duration

Timeout time.Duration
Expand All @@ -51,7 +53,7 @@ type CheckTypes []*CheckType

// Valid checks if the CheckType is valid, i.e. whether it qualifies as
// at least one of the supported kinds: TTL, monitor, HTTP or TCP.
func (c *CheckType) Valid() bool {
	// A single return covering every kind. A leftover three-kind return
	// placed ahead of this one would make the TCP case unreachable and
	// cause TCP-only check types to be rejected.
	return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP()
}

// IsTTL checks if this is a TTL type
Expand All @@ -69,6 +71,11 @@ func (c *CheckType) IsHTTP() bool {
return c.HTTP != "" && c.Interval != 0
}

// IsTCP reports whether this check type describes a TCP check, which
// requires both a non-empty TCP target and a non-zero Interval.
func (c *CheckType) IsTCP() bool {
	if c.TCP == "" {
		return false
	}
	return c.Interval != 0
}

// CheckNotifier interface is used by the CheckMonitor
// to notify when a check has a status update. The update
// should take care to be idempotent.
Expand Down Expand Up @@ -402,3 +409,86 @@ func (c *CheckHTTP) check() {
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result)
}
}

// CheckTCP is used to periodically make a TCP connection to
// determine the health of a given check.
// The check is passing if the connection succeeds
// The check is critical if the connection returns an error
type CheckTCP struct {
// Notify receives the status update produced by every check run.
Notify CheckNotifier
// CheckID is the ID reported to Notify with every update.
CheckID string
// TCP is the address handed to the dialer; presumably "host:port".
TCP string
// Interval is the delay between consecutive connection attempts.
Interval time.Duration
// Timeout, when positive and shorter than Interval, is used as the
// dial timeout.
Timeout time.Duration
// Logger receives the debug and warning output of the check.
Logger *log.Logger

// dialer is created on the first call to Start and reused afterwards.
dialer *net.Dialer
// stop records whether Stop has already closed stopCh; stopCh ends the
// run loop when closed; stopLock guards both in Start and Stop.
stop bool
stopCh chan struct{}
stopLock sync.Mutex
}

// Start is used to start a TCP check.
// The check runs until Stop is called.
//
// On first use it creates the dialer and chooses the dial timeout:
//   - Timeout, when it is positive and shorter than Interval;
//   - otherwise Interval, when Interval is shorter than 10s;
//   - otherwise 10s.
//
// This means that a check *should* return before the next check begins.
func (c *CheckTCP) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.dialer == nil {
		// Create the socket dialer with the 10s cap as its default. A zero
		// net.Dialer.Timeout means "no timeout", so without this default a
		// long-interval check with no explicit Timeout could block until
		// the operating system gives up on the connection.
		c.dialer = &net.Dialer{DualStack: true, Timeout: 10 * time.Second}

		if c.Timeout > 0 && c.Timeout < c.Interval {
			c.dialer.Timeout = c.Timeout
		} else if c.Interval < 10*time.Second {
			c.dialer.Timeout = c.Interval
		}
	}

	// Reset the stop state so a previously stopped check can be restarted.
	c.stop = false
	c.stopCh = make(chan struct{})
	go c.run()
}

// Stop halts a TCP check by closing its stop channel. Calling Stop
// again on an already stopped check does nothing.
func (c *CheckTCP) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}

// run is the body of the goroutine launched by Start. It waits for a
// randomized initial delay, then performs a check every Interval until
// the stop channel is closed.
func (c *CheckTCP) run() {
	stagger := randomStagger(c.Interval)
	c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", stagger, c.TCP)

	// The timer is re-armed only after a check has completed, so the
	// interval is measured from the end of one check to the next.
	for tick := time.After(stagger); ; tick = time.After(c.Interval) {
		select {
		case <-c.stopCh:
			return
		case <-tick:
			c.check()
		}
	}
}

// check performs a single TCP connection attempt and reports the result
// to the notifier: passing when the dial succeeds, critical (with the
// dial error as output) when it fails.
func (c *CheckTCP) check() {
	conn, err := c.dialer.Dial("tcp", c.TCP)
	if err == nil {
		// Only reachability matters; the connection is dropped right away.
		conn.Close()
		c.Logger.Printf("[DEBUG] agent: check '%v' is passing", c.CheckID)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
		return
	}

	c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
	c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
}
72 changes: 72 additions & 0 deletions command/agent/check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package agent
import (
"fmt"
"log"
"net"
"net/http"
"net/http/httptest"
"os"
Expand Down Expand Up @@ -321,3 +322,74 @@ func TestCheckHTTP_disablesKeepAlives(t *testing.T) {
t.Fatalf("should have disabled keepalives")
}
}

// mockTCPServer opens a listener on an ephemeral loopback port for the
// given network. The IPv6 loopback is used for `tcp6`, the IPv4 loopback
// for everything else. It panics if the listener cannot be created.
func mockTCPServer(network string) net.Listener {
	addr := `127.0.0.1:0`
	if network == `tcp6` {
		addr = `[::1]:0`
	}

	ln, err := net.Listen(network, addr)
	if err != nil {
		panic(err)
	}
	return ln
}

// expectTCPStatus runs a CheckTCP against the address tcp with a 10ms
// interval for 50ms, then fails the test unless the mock notifier saw at
// least two updates for check "foo" and its last state equals status.
func expectTCPStatus(t *testing.T, tcp string, status string) {
	notifier := &MockNotify{
		state:   make(map[string]string),
		updates: make(map[string]int),
		output:  make(map[string]string),
	}
	logger := log.New(os.Stderr, "", log.LstdFlags)

	chk := &CheckTCP{
		Notify:   notifier,
		CheckID:  "foo",
		TCP:      tcp,
		Interval: 10 * time.Millisecond,
		Logger:   logger,
	}
	chk.Start()
	defer chk.Stop()

	// Give the check time to run several times.
	time.Sleep(50 * time.Millisecond)

	// Should have at least 2 updates
	if count := notifier.updates["foo"]; count < 2 {
		t.Fatalf("should have 2 updates %v", notifier.updates)
	}

	if got := notifier.state["foo"]; got != status {
		t.Fatalf("should be %v %v", status, notifier.state)
	}
}

func TestCheckTCPCritical(t *testing.T) {
var (
tcpServer net.Listener
)

tcpServer = mockTCPServer(`tcp`)
expectTCPStatus(t, `127.0.0.1:0`, "critical")
tcpServer.Close()
}

func TestCheckTCPPassing(t *testing.T) {
var (
tcpServer net.Listener
)

tcpServer = mockTCPServer(`tcp`)
expectTCPStatus(t, tcpServer.Addr().String(), "passing")
tcpServer.Close()

tcpServer = mockTCPServer(`tcp6`)
expectTCPStatus(t, tcpServer.Addr().String(), "passing")
tcpServer.Close()
}
30 changes: 29 additions & 1 deletion website/source/docs/agent/checks.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ There are three different kinds of checks:
It is possible to configure a custom HTTP check timeout value by specifying
the `timeout` field in the check definition.

* TCP + Interval - These checks make a TCP connection attempt every Interval
(e.g. every 30 seconds) to the specified IP/hostname and port. The status of
the service depends on whether the connection attempt is successful (i.e. the
port is currently accepting connections). If the connection is accepted, the
status is `passing`, otherwise the status is `critical`. In the case of a
hostname that resolves to both IPv4 and IPv6 addresses, an attempt will be
made to both addresses, and the first successful connection attempt will
result in a successful check. This type of check should be preferred over a
script that uses `netcat` or another external process to check a simple socket
operation. By default, TCP checks will be configured with a request timeout
equal to the check interval, with a max of 10 seconds. It is possible to
configure a custom TCP check timeout value by specifying the `timeout` field
in the check definition.

* <a name="TTL"></a>Time to Live (TTL) - These checks retain their last known state for a given TTL.
The state of the check must be updated periodically over the HTTP interface. If an
external system fails to update the status within a given TTL, the check is
Expand Down Expand Up @@ -75,6 +89,20 @@ A HTTP check:
}
```

A TCP check:

```javascript
{
"check": {
"id": "ssh",
"name": "SSH TCP on port 22",
"tcp": "localhost:22",
"interval": "10s",
"timeout": "1s"
}
}
```

A TTL check:

```javascript
Expand Down Expand Up @@ -102,7 +130,7 @@ Checks may also contain a `token` field to provide an ACL token. This token is
used for any interaction with the catalog for the check, including
[anti-entropy syncs](/docs/internals/anti-entropy.html) and deregistration.

Both script and HTTP checks must include an `interval` field. This field is
Script, TCP and HTTP checks must include an `interval` field. This field is
parsed by Go's `time` package, and has the following
[formatting specification](http://golang.org/pkg/time/#ParseDuration):
> A duration string is a possibly signed sequence of decimal numbers, each with
Expand Down
17 changes: 13 additions & 4 deletions website/source/docs/agent/http/agent.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,8 @@ The endpoint always returns 200.

The register endpoint is used to add a new check to the local agent.
There is more documentation on checks [here](/docs/agent/checks.html).
Checks may be of script, HTTP, or TTL type. The agent is responsible for managing
the status of the check and keeping the Catalog in sync.
Checks may be of script, HTTP, TCP, or TTL type. The agent is responsible for
managing the status of the check and keeping the Catalog in sync.

The register endpoint expects a JSON request body to be PUT. The request
body must look like:
Expand All @@ -237,13 +237,14 @@ body must look like:
"Notes": "Ensure we don't oversubscribe memory",
"Script": "/usr/local/bin/check_mem.py",
"HTTP": "http://example.com",
"TCP": "example.com:22",
"Interval": "10s",
"TTL": "15s"
}
```

The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
`Script` and `HTTP` also require that `Interval` be set.
The `Name` field is mandatory, as is one of `Script`, `HTTP`, `TCP` or `TTL`.
`Script`, `TCP` and `HTTP` also require that `Interval` be set.

If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
`ID` entries per agent, so it may be necessary to provide an `ID`.
Expand All @@ -258,6 +259,14 @@ be a URL) every `Interval`. If the response is any `2xx` code, the check is `pas
If the response is `429 Too Many Requests`, the check is `warning`. Otherwise, the check
is `critical`.

A `TCP` check will perform a TCP connection attempt against the value of `TCP`
(expected to be an IP/hostname and port combination) every `Interval`. If the
connection attempt is successful, the check is `passing`. If the connection
attempt is unsuccessful, the check is `critical`. In the case of a hostname
that resolves to both IPv4 and IPv6 addresses, an attempt will be made to both
addresses, and the first successful connection attempt will result in a
successful check.

If a `TTL` type is used, then the TTL update endpoint must be used periodically to update
the state of the check.

Expand Down
Loading