-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Docker Support for Consul Health Checks #1343
Changes from 8 commits
80ad971
24ed164
71ede8a
83db728
2632bb3
d695012
40f72a8
31cdf4f
809e9f5
5f8f531
4c1818e
b4af7f4
f5f5ed0
423f7fb
9efbd1a
5827865
f0c783d
1e240b5
471442e
2fdcf1a
3d68d06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,12 +6,14 @@ import ( | |
"log" | ||
"net" | ||
"net/http" | ||
"os" | ||
"os/exec" | ||
"sync" | ||
"syscall" | ||
"time" | ||
|
||
"github.com/armon/circbuf" | ||
docker "github.com/fsouza/go-dockerclient" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Surprised there's no official client but didn't see one. There's another by a person that says they work for Docker that's popular. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We're using this in Nomad and Terraform. The code and docs are great and it's well-supported. 👍 |
||
"github.com/hashicorp/consul/consul/structs" | ||
"github.com/hashicorp/go-cleanhttp" | ||
) | ||
|
@@ -38,10 +40,12 @@ const ( | |
// Only one of the types needs to be provided | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This comment is now a little stale. |
||
// TTL or Script/Interval or HTTP/Interval or TCP/Interval or DockerContainerId/Interval | ||
type CheckType struct { | ||
Script string | ||
HTTP string | ||
TCP string | ||
Interval time.Duration | ||
Script string | ||
HTTP string | ||
TCP string | ||
Interval time.Duration | ||
DockerContainerId string | ||
Shell string | ||
|
||
Timeout time.Duration | ||
TTL time.Duration | ||
|
@@ -54,7 +58,7 @@ type CheckTypes []*CheckType | |
|
||
// Valid checks if the CheckType is valid | ||
func (c *CheckType) Valid() bool { | ||
return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP() | ||
return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP() || c.IsDocker() | ||
} | ||
|
||
// IsTTL checks if this is a TTL type | ||
|
@@ -64,7 +68,7 @@ func (c *CheckType) IsTTL() bool { | |
|
||
// IsMonitor checks if this is a Monitor type | ||
func (c *CheckType) IsMonitor() bool { | ||
return c.Script != "" && c.Interval != 0 | ||
return c.Script != "" && c.DockerContainerId == "" && c.Interval != 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this made me realize how sloppy the current |
||
} | ||
|
||
// IsHTTP checks if this is a HTTP type | ||
|
@@ -77,6 +81,10 @@ func (c *CheckType) IsTCP() bool { | |
return c.TCP != "" && c.Interval != 0 | ||
} | ||
|
||
func (c *CheckType) IsDocker() bool { | ||
return c.DockerContainerId != "" && c.Shell != "" && c.Interval != 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably don't want the shell check here since it's treated as optional down below. |
||
} | ||
|
||
// CheckNotifier interface is used by the CheckMonitor | ||
// to notify when a check has a status update. The update | ||
// should take care to be idempotent. | ||
|
@@ -493,3 +501,134 @@ func (c *CheckTCP) check() { | |
c.Logger.Printf("[DEBUG] agent: check '%v' is passing", c.CheckID) | ||
c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP)) | ||
} | ||
|
||
// A custom interface since go-dockerclient doesn't have one | ||
// We will use this interface in our test to inject a fake client | ||
type DockerClient interface { | ||
CreateExec(docker.CreateExecOptions) (*docker.Exec, error) | ||
StartExec(string, docker.StartExecOptions) error | ||
InspectExec(string) (*docker.ExecInspect, error) | ||
} | ||
|
||
// CheckDocker is used to periodically invoke a script to | ||
// determine the health of an application running inside a | ||
// Docker Container. We assume that the script is compatible | ||
// with nagios plugins and expects the output in the same format. | ||
type CheckDocker struct { | ||
Notify CheckNotifier | ||
CheckID string | ||
Script string | ||
DockerContainerId string | ||
Shell string | ||
Interval time.Duration | ||
Logger *log.Logger | ||
|
||
dockerClient DockerClient | ||
cmd []string | ||
stop bool | ||
stopCh chan struct{} | ||
stopLock sync.Mutex | ||
} | ||
|
||
// Start is used to start checks. | ||
// Docker Checks runs until stop is called | ||
func (c *CheckDocker) Start() { | ||
c.stopLock.Lock() | ||
defer c.stopLock.Unlock() | ||
|
||
//create the docker client | ||
if c.dockerClient == nil { | ||
var err error | ||
c.dockerClient, err = docker.NewClientFromEnv() | ||
if err != nil { | ||
c.Logger.Println("[DEBUG] Error creating the Docker Client : %s", err.Error()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might crash later because the client is |
||
} | ||
} | ||
|
||
//figure out the shell | ||
if c.Shell == "" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's probably worth putting this into a helper function and letting normal monitors support the new There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will add support for specifying shells in normal monitors in a separate PR. |
||
if otherShell := os.Getenv("SHELL"); otherShell != "" { | ||
c.Shell = otherShell | ||
} else { | ||
c.Shell = "/bin/sh" | ||
} | ||
} | ||
|
||
c.cmd = []string{c.Shell, "-c", c.Script} | ||
|
||
c.stop = false | ||
c.stopCh = make(chan struct{}) | ||
go c.run() | ||
} | ||
|
||
// Stop is used to stop a docker check. | ||
func (c *CheckDocker) Stop() { | ||
c.stopLock.Lock() | ||
defer c.stopLock.Unlock() | ||
if !c.stop { | ||
c.stop = true | ||
close(c.stopCh) | ||
} | ||
} | ||
|
||
// run is invoked by a goroutine to run until Stop() is called | ||
func (c *CheckDocker) run() { | ||
// Get the randomized initial pause time | ||
initialPauseTime := randomStagger(c.Interval) | ||
c.Logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s -c %s in container %s", initialPauseTime, c.Shell, c.Script, c.DockerContainerId) | ||
next := time.After(initialPauseTime) | ||
for { | ||
select { | ||
case <-next: | ||
c.check() | ||
next = time.After(c.Interval) | ||
case <-c.stopCh: | ||
return | ||
} | ||
} | ||
} | ||
|
||
func (c *CheckDocker) check() { | ||
//Set up the Exec since | ||
execOpts := docker.CreateExecOptions{ | ||
AttachStdin: false, | ||
AttachStdout: true, | ||
AttachStderr: true, | ||
Tty: false, | ||
Cmd: c.cmd, | ||
Container: c.DockerContainerId, | ||
} | ||
var ( | ||
exec *docker.Exec | ||
err error | ||
) | ||
if exec, err = c.dockerClient.CreateExec(execOpts); err == nil { | ||
exec = exec | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this do anything since it got assigned above? Could probably drop this clause and change the logic to |
||
} else { | ||
c.Logger.Printf("[DEBUG] agent: Error while creating Exec: %s", err.Error()) | ||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, fmt.Sprintf("Unable to create Exec, error: %s", err.Error())) | ||
return | ||
} | ||
|
||
err = c.dockerClient.StartExec(exec.ID, docker.StartExecOptions{Detach: false, Tty: false}) | ||
if err != nil { | ||
c.Logger.Printf("[DEBUG] Error in executing health checks: %s", err.Error()) | ||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, fmt.Sprintf("Unable to start exec: %s", err.Error())) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you capitalized "Exec" in the other error messages. |
||
return | ||
} | ||
|
||
execInfo, err := c.dockerClient.InspectExec(exec.ID) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a random curiosity question - is this state the Docker holds onto that we need to clear, or is it something maintained by the client layer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @slackpad This is something I have been wondering too. Not sure when does Docker GC the Execs that users have created. The API doesn't say anything about it. This is how docker client does it too based on https://github.com/docker/docker/blob/master/api/client/exec.go. |
||
if err != nil { | ||
c.Logger.Printf("[DEBUG] Error in inspecting check result : %s", err.Error()) | ||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, fmt.Sprintf("Unable to inspect Exec: %s", err.Error())) | ||
return | ||
} | ||
|
||
if execInfo.ExitCode == 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about the check output? A (truncated) version of the check output is normally passed in to |
||
c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, fmt.Sprintf("Script execution %s: Success", c.Script)) | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An exit code of 1 should set it to warning, not critical. |
||
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", execInfo.ExitCode) | ||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, fmt.Sprintf("Script execution faied with exit code: %s", execInfo.ExitCode)) | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd add a final else that returns an error that the type is unknown.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@slackpad The previous behaviour defaulted to Script check, perhaps we should keep that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that would be ok, too.