Skip to content

Commit

Permalink
Improve health check for Fleet Server (#1824)
Browse files Browse the repository at this point in the history
Fleet Server restarts once after first reporting healthy. Agents that try
to enroll during this restart fail and need to be restarted. Improve the
healthcheck so it requires three consecutive healthy checks before
considering Fleet Server ready to accept enrollments.
  • Loading branch information
jsoriano authored May 8, 2024
1 parent e53417a commit db4c611
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 7 deletions.
7 changes: 4 additions & 3 deletions internal/stack/_static/docker-compose-stack.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ services:
kibana:
condition: service_healthy
healthcheck:
test: "curl --cacert /etc/ssl/elastic-agent/ca-cert.pem -f https://localhost:8220/api/status | grep -i healthy 2>&1 >/dev/null"
test: "bash /healthcheck.sh"
start_period: 60s
interval: 5s
hostname: docker-fleet-server
Expand All @@ -112,8 +112,9 @@ services:
- "KIBANA_FLEET_SETUP=1"
- "KIBANA_HOST={{ fact "kibana_host" }}"
volumes:
- "../certs/ca-cert.pem:/etc/ssl/certs/elastic-package.pem"
- "../certs/fleet-server:/etc/ssl/elastic-agent"
- "../certs/ca-cert.pem:/etc/ssl/certs/elastic-package.pem:ro"
- "../certs/fleet-server:/etc/ssl/elastic-agent:ro"
- "./fleet-server-healthcheck.sh:/healthcheck.sh:ro"
ports:
- "127.0.0.1:8220:8220"
{{ if eq $apm_enabled "true" }}
Expand Down
17 changes: 17 additions & 0 deletions internal/stack/_static/fleet-server-healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Container healthcheck for Fleet Server.
#
# Exits 0 only when $expected consecutive status probes all report "healthy";
# exits non-zero as soon as a single probe fails.

# Abort on the first failing command. pipefail makes a failing curl fail the
# whole "curl | grep" pipeline instead of being masked by grep's exit status.
set -e -o pipefail

# healthcheck probes the local Fleet Server status endpoint. It succeeds only
# if the request itself succeeds (-f turns HTTP error responses into a non-zero
# exit) and the response body contains "healthy" (case-insensitive).
healthcheck() {
  # --silent drops curl's progress meter so it does not clutter the healthcheck
  # output; --show-error keeps real failures visible.
  #
  # grep's output is discarded with a redirection rather than with -q on
  # purpose: -q lets grep exit at the first match, which can make curl fail
  # writing to a closed pipe and, under pipefail, turn a healthy response into
  # a failed check. Note the order: stdout is redirected first, then stderr is
  # pointed at it, so both streams are discarded.
  curl --silent --show-error --cacert /etc/ssl/elastic-agent/ca-cert.pem -f https://localhost:8220/api/status \
    | grep -i healthy >/dev/null 2>&1
}

# Fleet Server can restart after announcing to be healthy, and agents that
# connect during this restart fail to enroll. Require several healthy probes
# in a row before reporting healthy.
expected=3
for ((i = 1; i <= expected; i++)); do
  healthcheck
  # Pause between probes only; sleeping after the last one would just delay
  # the healthy result.
  if ((i < expected)); then
    sleep 1
  fi
done

exit 0
10 changes: 7 additions & 3 deletions internal/stack/boot.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,14 @@ func BootUp(ctx context.Context, options Options) error {
// As a workaround, try to give another chance to docker-compose if only
// elastic-agent failed.
if onlyElasticAgentFailed(ctx, options) && !errors.Is(err, context.Canceled) {
sleepTime := 10 * time.Second
sleepTime := 2 * time.Second
fmt.Printf("Elastic Agent failed to start, trying again in %s.\n", sleepTime)
time.Sleep(sleepTime)
err = dockerComposeUp(ctx, options)
select {
case <-time.After(sleepTime):
err = dockerComposeUp(ctx, options)
case <-ctx.Done():
err = ctx.Err()
}
}
if err != nil {
return fmt.Errorf("running docker-compose failed: %w", err)
Expand Down
9 changes: 8 additions & 1 deletion internal/stack/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ const (
// KibanaHealthcheckFile is the kibana healthcheck.
KibanaHealthcheckFile = "kibana_healthcheck.sh"

// FleetServerHealthcheckFile is the Fleet Server healthcheck.
FleetServerHealthcheckFile = "fleet-server-healthcheck.sh"

// PackageRegistryConfigFile is the config file for the Elastic Package registry
PackageRegistryConfigFile = "package-registry.yml"

Expand Down Expand Up @@ -107,7 +110,11 @@ var (
},
&resource.File{
Path: KibanaHealthcheckFile,
Content: staticSource.Template("_static/kibana_healthcheck.sh.tmpl"),
Content: staticSource.Template("_static/kibana-healthcheck.sh.tmpl"),
},
&resource.File{
Path: FleetServerHealthcheckFile,
Content: staticSource.File("_static/fleet-server-healthcheck.sh"),
},
&resource.File{
Path: PackageRegistryConfigFile,
Expand Down

0 comments on commit db4c611

Please sign in to comment.