Add /liveness endpoint to elastic-agent #4499

Merged
merged 32 commits into main from liveness-endpoint
Apr 15, 2024
32 commits
2755f0f
add liveness endpoint
fearful-symmetry Mar 28, 2024
11ad640
format files
fearful-symmetry Mar 28, 2024
1c78dbb
add changelog
fearful-symmetry Mar 28, 2024
bdc35c6
linter...
fearful-symmetry Mar 28, 2024
5590642
linter, again
fearful-symmetry Mar 28, 2024
0eab59a
refactor helpers so we don't move the component type around
fearful-symmetry Mar 29, 2024
fb6a31d
finish up reloader, HTTP handling
fearful-symmetry Apr 4, 2024
1dd8c9d
cleanup
fearful-symmetry Apr 4, 2024
240c25b
Merge remote-tracking branch 'upstream/main' into liveness-endpoint
fearful-symmetry Apr 4, 2024
4eeff6b
linter
fearful-symmetry Apr 4, 2024
3eee7aa
notice
fearful-symmetry Apr 4, 2024
82e338d
remove tests based on changed behavior
fearful-symmetry Apr 4, 2024
673e9e9
Merge remote-tracking branch 'origin/liveness-endpoint' into liveness…
fearful-symmetry Apr 4, 2024
9166208
add integration test
fearful-symmetry Apr 5, 2024
0e53358
format
fearful-symmetry Apr 5, 2024
3c1fba3
format, again
fearful-symmetry Apr 5, 2024
f85692b
code cleanup
fearful-symmetry Apr 9, 2024
e3fe848
add heartbeat check
fearful-symmetry Apr 10, 2024
6b548b7
Merge remote-tracking branch 'upstream/main' into liveness-endpoint
fearful-symmetry Apr 10, 2024
6f83308
fix tests
fearful-symmetry Apr 10, 2024
b1dfcb0
improve comments
fearful-symmetry Apr 10, 2024
ad6808b
refactor to simplify http handler
fearful-symmetry Apr 11, 2024
dff3e0d
linter
fearful-symmetry Apr 11, 2024
98370bd
revert some changes from older handling implementation
fearful-symmetry Apr 11, 2024
bfd1291
linter
fearful-symmetry Apr 11, 2024
a24a7e8
fix linter and loop mem usage
fearful-symmetry Apr 11, 2024
f675b3a
update docs, fix test
fearful-symmetry Apr 12, 2024
85189bf
improve docs
fearful-symmetry Apr 12, 2024
336b73d
simplify switch statement
fearful-symmetry Apr 12, 2024
1b65f37
update docs, tests, provide a 'coordinator' mode
fearful-symmetry Apr 12, 2024
6e46f01
change default behavior, rename failon opts
fearful-symmetry Apr 12, 2024
3e64845
cleanup, rename a few things, short-out coordinator check
fearful-symmetry Apr 15, 2024
4 changes: 4 additions & 0 deletions _meta/config/common.p2.yml.tmpl
@@ -66,6 +66,10 @@ inputs:
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # exposes agent metrics using http, by default sockets and named pipes are used
# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# http:
# # enables http endpoint
# enabled: false
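
For context, a minimal sketch of how an external monitor could poll this endpoint once the monitoring http section is enabled; the address and port below are assumptions for illustration, not values taken from this PR:

// liveness_probe.go: a sketch of polling the agent's /liveness endpoint.
// The address is an assumption; use whatever host/port the monitoring http
// section of the agent configuration actually exposes.
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	client := &http.Client{Timeout: 15 * time.Second}

	resp, err := client.Get("http://localhost:6791/liveness") // assumed address
	if err != nil {
		fmt.Println("agent unreachable:", err)
		return
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK: // 200: agent is healthy
		fmt.Println("healthy")
	case http.StatusInternalServerError: // 500: a component or unit is in a failed state
		fmt.Println("component or unit failure")
	case http.StatusServiceUnavailable: // 503: the agent coordinator is unresponsive
		fmt.Println("coordinator unresponsive")
	default:
		fmt.Printf("unexpected status: %d\n", resp.StatusCode)
	}
}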
6 changes: 5 additions & 1 deletion _meta/config/common.reference.p2.yml.tmpl
@@ -144,7 +144,11 @@ inputs:
# pprof.enabled: false
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # exposes agent metrics using http, by default sockets and named pipes are used
# # Exposes agent metrics using http, by default sockets and named pipes are used.
# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# http:
# # enables http endpoint
# enabled: false
32 changes: 32 additions & 0 deletions changelog/fragments/1711653910-add-liveness-endpoint.yaml
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: feature

# Change summary; a 80ish characters long description of the change.
summary: add-liveness-endpoint

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
description: Adds a liveness endpoint for k8s monitoring

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: monitoring

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
#pr: https://github.com/owner/repo/1234

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
6 changes: 5 additions & 1 deletion elastic-agent.reference.yml
@@ -150,7 +150,11 @@ inputs:
# pprof.enabled: false
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # exposes agent metrics using http, by default sockets and named pipes are used
# # Exposes agent metrics using http, by default sockets and named pipes are used.
# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# http:
# # enables http endpoint
# enabled: false
4 changes: 4 additions & 0 deletions elastic-agent.yml
@@ -72,6 +72,10 @@ inputs:
# # The name of the output to use for monitoring data.
# use_output: monitoring
# # exposes agent metrics using http, by default sockets and named pipes are used
# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
# # 200: Agent is healthy
# # 500: A component or unit is in a failed state
# # 503: The agent coordinator is unresponsive
# http:
# # enables http endpoint
# enabled: false
24 changes: 24 additions & 0 deletions internal/pkg/agent/application/coordinator/coordinator.go
@@ -279,6 +279,11 @@ type Coordinator struct {

// mx sync.RWMutex
// protection protection.Config

// a sync channel that other components can use to check if the main coordinator
// loop in runLoopIteration() is active and listening.
// Should only be interacted with via CoordinatorActive() or runLoopIteration()
heartbeatChan chan struct{}
}

// The channels Coordinator reads to receive updates from the various managers.
@@ -372,6 +377,7 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp.
logLevelCh: make(chan logp.Level),
overrideStateChan: make(chan *coordinatorOverrideState),
upgradeDetailsChan: make(chan *details.Details),
heartbeatChan: make(chan struct{}),
}
// Setup communication channels for any non-nil components. This pattern
// lets us transparently accept nil managers / simulated events during
@@ -412,6 +418,22 @@ func (c *Coordinator) State() State {
return c.stateBroadcaster.Get()
}

// CoordinatorActive is a blocking method that waits for a channel response
// from the coordinator loop. This can be used as a basic health check,
// as we'll time out and return false if the coordinator run loop doesn't
// respond to our channel.
func (c *Coordinator) CoordinatorActive(timeout time.Duration) bool {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()

select {
case <-c.heartbeatChan:
return true
case <-ctx.Done():
return false
}
}

func (c *Coordinator) RegisterMonitoringServer(s configReloader) {
c.monitoringServerReloader = s
}
@@ -977,6 +999,8 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) {
case upgradeDetails := <-c.upgradeDetailsChan:
c.setUpgradeDetails(upgradeDetails)

case c.heartbeatChan <- struct{}{}:
Contributor:

question on this. This looks like we will put something on the channel as soon as possible, so if the coordinator gets blocked after that, you would read from the heartbeatChan and think that it was up, because in the past it had been able to write to the channel. On your next read it would fail. Is that correct?

If it is, I'd rather see something that records a timestamp every time runLoopIteration is called, then we can check to see if that timestamp is within our timeout window.
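
For illustration only, a rough sketch of the timestamp-based alternative described here (type and method names are hypothetical; this is not what the PR merged):

// Hypothetical sketch: runLoopIteration would stamp the clock on every pass,
// and the health check would ask whether that stamp is recent enough.
package coordinator

import (
	"sync"
	"time"
)

type loopHeartbeat struct {
	mu       sync.Mutex
	lastSeen time.Time
}

// beat would be called at the top of every runLoopIteration.
func (h *loopHeartbeat) beat() {
	h.mu.Lock()
	h.lastSeen = time.Now()
	h.mu.Unlock()
}

// alive reports whether the loop ran within the allowed window,
// e.g. twice the check-in interval as suggested later in this thread.
func (h *loopHeartbeat) alive(window time.Duration) bool {
	h.mu.Lock()
	defer h.mu.Unlock()
	return time.Since(h.lastSeen) <= window
}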

Contributor Author:

so if the coordinator gets blocked after that, you would read from the heartbeatChan and think that it was up, because in the past it had been able to write to the channel. On your next read it would fail. Is that correct?

So, if I understand you correctly, yes. We could end up in a state where the coordinator blocks right after a heartbeat call. Because the liveness endpoint is meant to be repeated on some kind of regular period, I'm not too worried about that.

I specifically didn't go with some kind of timestamp mechanism on @faec's advice, since she was worried about the added complexity/edge cases of a time comparison, compared to a simple signal like this.

Contributor:

I think this just tells us that the runLoopIteration function can run and the select can hit the heartbeat case. It doesn't tell us if the coordinator can/has processed any of the other cases.

I was thinking to be "alive", we want to know that one of the case statements besides heartbeat statement has run. We would probably need to bound that comparison with the check-in Interval. Or to put another way, the runLoopIteration function should happen at least every check-in Interval (2x is probably safer) and it might happen more frequently.

Contributor Author:

Yeah, I sort of agree, this only really works as a basic heartbeat.

My concern is that "has the coordinator done anything else?" would be a bit flaky, as we can't really guarantee what state other sub-components will be in at any given time. @faec can comment more, but for that to work we might need to refactor part of the coordinator to allow for a more sophisticated health check.


case componentState := <-c.managerChans.runtimeManagerUpdate:
// New component change reported by the runtime manager via
// Coordinator.watchRuntimeComponents(), merge it with the
@@ -14,6 +14,7 @@ import (
"context"
"errors"
"fmt"
"net"
"testing"
"time"

Expand Down Expand Up @@ -570,7 +571,7 @@ func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) {
}

monitoringServer := &fakeMonitoringServer{}
newServerFn := func() (reload.ServerController, error) {
newServerFn := func(*monitoringCfg.MonitoringConfig) (reload.ServerController, error) {
return monitoringServer, nil
}
monitoringReloader := reload.NewServerReloader(newServerFn, logger, monitoringCfg.DefaultConfig())
Expand Down Expand Up @@ -1054,3 +1055,7 @@ func (fs *fakeMonitoringServer) Reset() {
fs.stopTriggered = false
fs.startTriggered = false
}

func (fs *fakeMonitoringServer) Addr() net.Addr {
return nil
}
12 changes: 11 additions & 1 deletion internal/pkg/agent/application/monitoring/handler.go
@@ -8,6 +8,9 @@ import (
"encoding/json"
"fmt"
"net/http"
"time"

"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
)

const errTypeUnexpected = "UNEXPECTED"
@@ -16,6 +19,13 @@ type apiError interface {
Status() int
}

// CoordinatorState is used by the HTTP handlers that take a coordinator object.
// This interface exists to help make testing easier.
type CoordinatorState interface {
State() coordinator.State
CoordinatorActive(timeout time.Duration) bool
}

func createHandler(fn func(w http.ResponseWriter, r *http.Request) error) *apiHandler {
return &apiHandler{
innerFn: fn,
@@ -30,7 +40,7 @@ type apiHandler struct {
func (h *apiHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
err := h.innerFn(w, r)
if err != nil {
switch e := err.(type) { // nolint:errorlint // Will need refactor.
switch e := err.(type) { //nolint:errorlint // Will need refactor.
case apiError:
w.WriteHeader(e.Status())
default:
40 changes: 40 additions & 0 deletions internal/pkg/agent/application/monitoring/liveness.go
@@ -0,0 +1,40 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package monitoring

import (
"fmt"
"net/http"
"time"

"github.com/elastic/elastic-agent-client/v7/pkg/client"
)

func livenessHandler(coord CoordinatorState) func(http.ResponseWriter, *http.Request) error {
return func(w http.ResponseWriter, r *http.Request) error {
w.Header().Set("Content-Type", "application/json; charset=utf-8")

state := coord.State()
isUp := coord.CoordinatorActive(time.Second * 10)

failConfig, err := handleFormValues(r)
if err != nil {
return fmt.Errorf("error handling form values: %w", err)
}

unhealthyComponent := false
for _, comp := range state.Components {
if (failConfig.Failed && comp.State.State == client.UnitStateFailed) || (failConfig.Degraded && comp.State.State == client.UnitStateDegraded) {
unhealthyComponent = true
}
}
if !isUp {
w.WriteHeader(http.StatusServiceUnavailable)
} else if unhealthyComponent {
w.WriteHeader(http.StatusInternalServerError)
}
return nil
}
}
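
To close the loop, a hedged sketch of how livenessHandler might be exercised against a fake CoordinatorState in a test. fakeCoordState is invented for illustration, and the sketch assumes handleFormValues tolerates a request with no query parameters:

package monitoring

import (
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
)

// fakeCoordState is a hypothetical test double satisfying CoordinatorState.
type fakeCoordState struct {
	active bool
	state  coordinator.State
}

func (f fakeCoordState) State() coordinator.State             { return f.state }
func (f fakeCoordState) CoordinatorActive(time.Duration) bool { return f.active }

func TestLivenessUnresponsiveCoordinator(t *testing.T) {
	handler := livenessHandler(fakeCoordState{active: false})

	// No query parameters; assumes handleFormValues accepts an empty form.
	req := httptest.NewRequest(http.MethodGet, "/liveness", nil)
	rec := httptest.NewRecorder()
	if err := handler(rec, req); err != nil {
		t.Fatal(err)
	}
	// An unresponsive coordinator should map to 503.
	if rec.Code != http.StatusServiceUnavailable {
		t.Fatalf("expected 503, got %d", rec.Code)
	}
}

Since createHandler wraps such a function in an apiHandler that implements http.Handler, registering the result on the monitoring server's mux at /liveness is presumably all the wiring the endpoint needs.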