Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Health checks windows service #13388

Merged
merged 25 commits into from
Oct 17, 2022
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4de260f
docs: added os_service
deblasis Jun 7, 2022
b59c19b
feat: windows service health checks
deblasis Jun 7, 2022
2cd5201
chore: changelog
deblasis Jun 7, 2022
af083cc
tests: added syscall mocking and tests for Check_OSService
deblasis Jun 9, 2022
28f19e4
tests: removed redundant probe test
deblasis Jun 9, 2022
06304bf
lint: conversion
deblasis Jun 9, 2022
ab611ea
Merge remote-tracking branch 'hashicorp/main' into feature/health-che…
deblasis Aug 15, 2022
5dee555
Merge remote-tracking branch 'hashicorp/main' into feature/health-che…
deblasis Aug 15, 2022
61f6fa9
Update website/content/docs/discovery/checks.mdx
deblasis Aug 28, 2022
f0f0421
Update website/content/api-docs/agent/check.mdx
deblasis Aug 28, 2022
f634e36
fix(OSServiceCheck): fixes following code-review
deblasis Aug 28, 2022
f3437ea
Merge remote-tracking branch 'hashicorp/main' into feature/health-che…
deblasis Aug 28, 2022
1d5bb02
docs(checks): updated OSService docs to match new format
deblasis Aug 28, 2022
b391392
Update website/content/docs/discovery/checks.mdx
deblasis Aug 29, 2022
fd2dd89
Update website/content/docs/discovery/checks.mdx
deblasis Aug 29, 2022
c0d647d
fix(agent): removed redundant check on prev. running check
deblasis Aug 29, 2022
26cc56b
fix(agent): removed redundant code in docker check as well
deblasis Aug 29, 2022
60c7c83
Merge remote-tracking branch 'hashicorp/main' into feature/health-che…
deblasis Aug 30, 2022
fc0dd92
fix(agent): uninitialized map panic error
deblasis Sep 21, 2022
f440966
fix(checks): os_service lifecycle bugfix
deblasis Sep 21, 2022
5719fd6
fix(checks): os_service OK message in output
deblasis Sep 21, 2022
461b42e
fix(check): added missing OSService props
deblasis Sep 21, 2022
a39d034
Update website/content/api-docs/agent/check.mdx
deblasis Sep 23, 2022
5bba2a3
Update website/content/docs/ecs/configuration-reference.mdx
deblasis Sep 23, 2022
5f99f57
Update website/content/api-docs/agent/check.mdx
deblasis Oct 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .changelog/13388.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
agent: windows service health check
46 changes: 43 additions & 3 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,9 @@ type Agent struct {
// checkAliases maps the check ID to an associated Alias checks
checkAliases map[structs.CheckID]*checks.CheckAlias

// checkOSServices maps the check ID to an associated OS Service check
checkOSServices map[structs.CheckID]*checks.CheckOSService

// exposedPorts tracks listener ports for checks exposed through a proxy
exposedPorts map[string]int

Expand All @@ -273,6 +276,9 @@ type Agent struct {
// dockerClient is the client for performing docker health checks.
dockerClient *checks.DockerClient

// osServiceClient is the client for performing OS service checks.
osServiceClient *checks.OSServiceClient

// eventCh is used to receive user events
eventCh chan serf.UserEvent

Expand Down Expand Up @@ -409,6 +415,7 @@ func New(bd BaseDeps) (*Agent, error) {
checkGRPCs: make(map[structs.CheckID]*checks.CheckGRPC),
checkDockers: make(map[structs.CheckID]*checks.CheckDocker),
checkAliases: make(map[structs.CheckID]*checks.CheckAlias),
checkOSServices: make(map[structs.CheckID]*checks.CheckOSService),
eventCh: make(chan serf.UserEvent, 1024),
eventBuf: make([]*UserEvent, 256),
joinLANNotifier: &systemd.Notifier{},
Expand Down Expand Up @@ -2894,12 +2901,45 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
Client: a.dockerClient,
StatusHandler: statusHandler,
}
if prev := a.checkDockers[cid]; prev != nil {
prev.Stop()
}
dockerCheck.Start()
a.checkDockers[cid] = dockerCheck

case chkType.IsOSService():
if existing, ok := a.checkOSServices[cid]; ok {
existing.Stop()
delete(a.checkOSServices, cid)
}
if chkType.Interval < checks.MinInterval {
a.logger.Warn("check has interval below minimum",
"check", cid.String(),
"minimum_interval", checks.MinInterval,
)
chkType.Interval = checks.MinInterval
}

if a.osServiceClient == nil {
ossp, err := checks.NewOSServiceClient()
if err != nil {
a.logger.Error("error creating OS Service client", "error", err)
return err
}
a.logger.Debug("created OS Service client")
a.osServiceClient = ossp
}

osServiceCheck := &checks.CheckOSService{
CheckID: cid,
ServiceID: sid,
OSService: chkType.OSService,
Timeout: chkType.Timeout,
Interval: chkType.Interval,
Logger: a.logger,
Client: a.osServiceClient,
StatusHandler: statusHandler,
}
osServiceCheck.Start()
a.checkOSServices[cid] = osServiceCheck

case chkType.IsMonitor():
if existing, ok := a.checkMonitors[cid]; ok {
existing.Stop()
Expand Down
120 changes: 120 additions & 0 deletions agent/checks/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bufio"
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"io/ioutil"
Expand Down Expand Up @@ -1047,6 +1048,125 @@ func (c *CheckGRPC) Stop() {
}
}

type CheckOSService struct {
CheckID structs.CheckID
ServiceID structs.ServiceID
OSService string
Interval time.Duration
Timeout time.Duration
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see Timeout being used anywhere. Is it worth implementing a timeout error? Unlike gRPC or script checks I would assume OS service polling shouldn't take too long and it's not worth enforcing.

Logger hclog.Logger
StatusHandler *StatusHandler
Client *OSServiceClient

stop bool
stopCh chan struct{}
stopLock sync.Mutex
stopWg sync.WaitGroup
}

// CheckType returns this check's configuration expressed as a
// structs.CheckType value.
func (c *CheckOSService) CheckType() structs.CheckType {
	chk := structs.CheckType{
		CheckID:   c.CheckID.ID,
		OSService: c.OSService,
		Interval:  c.Interval,
		Timeout:   c.Timeout,
	}
	return chk
}

// Start launches the background polling goroutine. A check that was
// previously stopped may be started again.
func (c *CheckOSService) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	// Fresh stop channel for this run; clear the stopped flag.
	c.stopCh = make(chan struct{})
	c.stop = false

	c.stopWg.Add(1)
	go c.run()
}

// Stop signals the polling goroutine to exit and blocks until it has
// fully terminated. Calling Stop more than once is safe.
func (c *CheckOSService) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	switch {
	case c.stop:
		// Already signalled; stopCh must not be closed twice.
	default:
		c.stop = true
		close(c.stopCh)
	}

	// Block until the c.run() goroutine has completed.
	c.stopWg.Wait()
}

// run is the check's main loop: it executes check() every Interval
// until stopCh is closed. The first execution is delayed by a random
// fraction of Interval so that many checks started together do not all
// fire at once.
func (c *CheckOSService) run() {
	defer c.stopWg.Done()

	// Randomized initial stagger.
	wait := time.After(lib.RandomStagger(c.Interval))
	for {
		select {
		case <-c.stopCh:
			return
		case <-wait:
			c.check()
			wait = time.After(c.Interval)
		}
	}
}

// doCheck polls the OS service once and maps the outcome to an API
// health status: passing on success, critical for
// ErrOSServiceStatusCritical, warning for any other error. The error
// (nil when passing) is returned alongside the status.
func (c *CheckOSService) doCheck() (string, error) {
	switch err := c.Client.Check(c.OSService); {
	case err == nil:
		return api.HealthPassing, nil
	case errors.Is(err, ErrOSServiceStatusCritical):
		return api.HealthCritical, err
	default:
		return api.HealthWarning, err
	}
}

// check runs one health-check attempt with a timeout and reports the
// result to the StatusHandler. The actual poll (doCheck) runs in a
// separate goroutine so this function can enforce the timeout; the
// buffered waitCh both delivers the result and synchronizes access to
// the status/err variables written by that goroutine.
func (c *CheckOSService) check() {
	var out string
	var status string
	var err error

	// Buffered so the goroutine can always complete its send even if
	// this function has already taken the timeout path.
	waitCh := make(chan error, 1)
	go func() {
		// status and err are safely read after receiving from waitCh
		// (channel receive establishes the necessary ordering).
		status, err = c.doCheck()
		waitCh <- err
	}()

	// Default to 30s when no explicit Timeout was configured.
	timeout := 30 * time.Second
	if c.Timeout > 0 {
		timeout = c.Timeout
	}
	select {
	case <-time.After(timeout):
		// Timed out: report critical immediately...
		msg := fmt.Sprintf("Timed out (%s) running check", timeout.String())
		c.Logger.Warn("Timed out running check",
			"check", c.CheckID.String(),
			"timeout", timeout.String(),
		)

		c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, msg)

		// Now wait for the process to exit so we never start another
		// instance concurrently.
		<-waitCh
		return

	case err = <-waitCh:
		// The process returned before the timeout, proceed normally
	}

	// On success the output is a friendly message; on failure the
	// error text becomes the check output.
	out = fmt.Sprintf("Service \"%s\" is healthy", c.OSService)
	if err != nil {
		c.Logger.Debug("Check failed",
			"check", c.CheckID.String(),
			"error", err,
		)
		out = err.Error()
	}
	c.StatusHandler.updateCheck(c.CheckID, status, out)
}

// StatusHandler keep tracks of successive error/success counts and ensures
// that status can be set to critical/passing only once the successive number of event
// reaches the given threshold.
Expand Down
Loading