Skip to content

Commit

Permalink
client: emit optional telemetry from prerun and prestart hooks (#24657)
Browse files Browse the repository at this point in the history
Co-authored-by: James Rasell <jrasell@users.noreply.github.com>
  • Loading branch information
hc-github-team-nomad-core and jrasell authored Dec 12, 2024
1 parent d778a31 commit 08cbe82
Show file tree
Hide file tree
Showing 23 changed files with 515 additions and 57 deletions.
3 changes: 3 additions & 0 deletions .changelog/24556.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
client: Emit telemetry from prerun and prestart hooks for monitoring and alerting
```
40 changes: 40 additions & 0 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ import (
"sync"
"time"

"github.com/armon/go-metrics"
log "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner/hookstats"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/allocrunner/state"
"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
Expand Down Expand Up @@ -44,6 +46,7 @@ import (

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {

// id is the ID of the allocation. Can be accessed without a lock
id string

Expand All @@ -53,6 +56,15 @@ type allocRunner struct {
// clientConfig is the client configuration block.
clientConfig *config.Config

// clientBaseLabels are the base metric labels generated by the client.
// These can be used by processes which emit metrics that want to include
// these labels that include node_id, node_class, and node_pool.
//
// They could be generated using the clientConfig, but the kv pairs will
// not change unless the client process is fully restarted, so passing them
// along avoids extra work.
clientBaseLabels []metrics.Label

// stateUpdater is used to emit updated alloc state
stateUpdater cinterfaces.AllocStateHandler

Expand Down Expand Up @@ -87,6 +99,10 @@ type allocRunner struct {
// vaultClientFunc is used to get the client used to manage Vault tokens
vaultClientFunc vaultclient.VaultClientFunc

// hookStatsHandler is used by certain hooks to emit telemetry data, if the
// operator has not disabled this functionality.
hookStatsHandler interfaces.HookStatsHandler

// waitCh is closed when the Run loop has exited
waitCh chan struct{}

Expand Down Expand Up @@ -230,6 +246,7 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
id: alloc.ID,
alloc: alloc,
clientConfig: config.ClientConfig,
clientBaseLabels: config.BaseLabels,
consulServicesHandler: config.ConsulServices,
consulProxiesClientFunc: config.ConsulProxiesFunc,
sidsClient: config.ConsulSI,
Expand Down Expand Up @@ -269,6 +286,8 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
// Create alloc broadcaster
ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

ar.setHookStatsHandler(alloc.Namespace)

// Create alloc dir
ar.allocDir = allocdir.NewAllocDir(
ar.logger,
Expand Down Expand Up @@ -315,6 +334,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
trConfig := &taskrunner.Config{
Alloc: ar.alloc,
ClientConfig: ar.clientConfig,
ClientBaseLabels: ar.clientBaseLabels,
Task: task,
TaskDir: ar.allocDir.NewTaskDir(task),
Logger: ar.logger,
Expand Down Expand Up @@ -1549,3 +1569,23 @@ func (ar *allocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error)
}
return allocVols.CSIVolumes, nil
}

// setHookStatsHandler builds the hook stats handler based on whether the
// operator has disabled this or not.
//
// The non-noop implementation is built using the base client labels and
// the allocation namespace. The caller will add "hook_name". It would be
// possible to add more labels, however, this would cause the metric
// cardinality to increase dramatically without much benefit.
//
// This could be inline within the NewAllocRunner function, but having a
// separate function makes testing easier.
func (ar *allocRunner) setHookStatsHandler(ns string) {
if ar.clientConfig.DisableAllocationHookMetrics {
ar.hookStatsHandler = hookstats.NewNoOpHandler()
} else {
labels := ar.clientBaseLabels
labels = append(labels, metrics.Label{Name: "namespace", Value: ns})
ar.hookStatsHandler = hookstats.NewHandler(labels, "alloc_hook")
}
}
12 changes: 11 additions & 1 deletion client/allocrunner/alloc_runner_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,17 @@ func (ar *allocRunner) prerun() error {
ar.logger.Trace("running pre-run hook", "name", name, "start", start)
}

if err := pre.Prerun(); err != nil {
// If the operator has disabled hook metrics, then don't call the time
// function to save 30ns per hook.
var hookExecutionStart time.Time

if !ar.clientConfig.DisableAllocationHookMetrics {
hookExecutionStart = time.Now()
}

err := pre.Prerun()
ar.hookStatsHandler.Emit(hookExecutionStart, name, "prerun", err)
if err != nil {
return fmt.Errorf("pre-run hook %q failed: %v", name, err)
}

Expand Down
31 changes: 31 additions & 0 deletions client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,18 @@ import (
"testing"
"time"

"github.com/armon/go-metrics"
"github.com/hashicorp/consul/api"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/allochealth"
"github.com/hashicorp/nomad/client/allocrunner/hookstats"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
arstate "github.com/hashicorp/nomad/client/allocrunner/state"
"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
"github.com/hashicorp/nomad/client/allocwatcher"
client "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/lib/proclib"
"github.com/hashicorp/nomad/client/serviceregistration"
regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
Expand Down Expand Up @@ -2531,3 +2534,31 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) {
calloc = ar.clientAlloc(map[string]*structs.TaskState{})
must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc))
}

func TestAllocRunner_setHookStatsHandler(t *testing.T) {
ci.Parallel(t)

// Create an allocation runner that doesn't have any configuration, which
// means the operator has not disabled hook metrics.
baseAllocRunner := &allocRunner{
clientConfig: &client.Config{},
clientBaseLabels: []metrics.Label{},
}

baseAllocRunner.setHookStatsHandler("platform")
handler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.Handler)
must.True(t, ok)
must.NotNil(t, handler)

// Create a new allocation runner but explicitly disable hook metrics
// collection.
baseAllocRunner = &allocRunner{
clientConfig: &client.Config{DisableAllocationHookMetrics: true},
clientBaseLabels: []metrics.Label{},
}

baseAllocRunner.setHookStatsHandler("platform")
noopHandler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.NoOpHandler)
must.True(t, ok)
must.NotNil(t, noopHandler)
}
53 changes: 53 additions & 0 deletions client/allocrunner/hookstats/hookstats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package hookstats

import (
"time"

"github.com/armon/go-metrics"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
)

// Handler implements interfaces.HookStatsHandler and is used when the operator
// has not disabled hook metrics.
type Handler struct {
baseLabels []metrics.Label
runnerType string
}

// NewHandler creates a new hook stats handler to be used for emitting hook
// stats for operator alerting and performance identification. The base labels
// should be passed from the client set of labels and the runner type indicates
// if the hooks are run from the alloc or task runner.
func NewHandler(base []metrics.Label, runnerType string) interfaces.HookStatsHandler {
return &Handler{
baseLabels: base,
runnerType: runnerType,
}
}

func (h *Handler) Emit(start time.Time, hookName, hookType string, err error) {

// Add the hook name to the base labels array, so we have a complete set to
// add to the metrics. Operators do not want this as part of the metric
// name due to cardinality control.
labels := h.baseLabels
labels = append(labels, metrics.Label{Name: "hook_name", Value: hookName})

metrics.MeasureSinceWithLabels([]string{"client", h.runnerType, hookType, "elapsed"}, start, labels)
if err != nil {
metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "failed"}, 1, labels)
} else {
metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "success"}, 1, labels)
}
}

// NoOpHandler implements interfaces.HookStatsHandler and is used when the
// operator has disabled hook metrics.
type NoOpHandler struct{}

func NewNoOpHandler() interfaces.HookStatsHandler { return &NoOpHandler{} }

func (n *NoOpHandler) Emit(_ time.Time, _, _ string, _ error) {}
113 changes: 113 additions & 0 deletions client/allocrunner/hookstats/hookstats_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package hookstats

import (
"errors"
"testing"
"time"

"github.com/armon/go-metrics"
"github.com/hashicorp/nomad/ci"
"github.com/shoenig/test/must"
)

func TestHandler(t *testing.T) {
ci.Parallel(t)

// Generate base labels that represent what an operator would see and then
// create out new handler to interact with.
baseLabels := []metrics.Label{
{Name: "datacenter", Value: "dc1"},
{Name: "node_class", Value: "none"},
{Name: "node_pool", Value: "default"},
{Name: "namespace", Value: "default"},
{Name: "host", Value: "client-5d3c"},
{Name: "node_id", Value: "35db24e7-0a55-80d2-2279-e022c37cc591"},
}
newHandler := NewHandler(baseLabels, "test_hook")

// The data stored is within the in-memory sink as map entries, so we need
// to know the key names to pull this out correctly. Build those now.
var metricKeySuffix, sampleName, counterSuccessName, counterFailureName string

for _, label := range baseLabels {
metricKeySuffix += ";" + label.Name + "=" + label.Value
}

metricKeySuffix += ";" + "hook_name=test_hook_name"
sampleName = "nomad_test.client.test_hook.prerun.elapsed" + metricKeySuffix
counterSuccessName = "nomad_test.client.test_hook.prerun.success" + metricKeySuffix
counterFailureName = "nomad_test.client.test_hook.prerun.failed" + metricKeySuffix

// Create an in-memory sink and global, so we can actually look at and test
// the metrics we emit.
inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)

_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
must.NoError(t, err)

// Emit hook related metrics where the supplied error is nil and check that
// the data is as expected.
newHandler.Emit(time.Now(), "test_hook_name", "prerun", nil)

sinkData := inMemorySink.Data()
must.Len(t, 1, sinkData)
must.MapContainsKey(t, sinkData[0].Counters, counterSuccessName)
must.MapContainsKey(t, sinkData[0].Samples, sampleName)

successCounter := sinkData[0].Counters[counterSuccessName]
must.Eq(t, 1, successCounter.Count)
must.Eq(t, 1, successCounter.Sum)

sample1 := sinkData[0].Samples[sampleName]
must.Eq(t, 1, sample1.Count)
must.True(t, sample1.Sum > 0)

// Create a new in-memory sink and global collector to ensure we don't have
// leftovers from the previous test.
inMemorySink = metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)

_, err = metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
must.NoError(t, err)

// Emit a hook related metrics where the supplied error is non-nil and
// check that the data is as expected.
newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))

sinkData = inMemorySink.Data()
must.Len(t, 1, sinkData)
must.MapContainsKey(t, sinkData[0].Counters, counterFailureName)
must.MapContainsKey(t, sinkData[0].Samples, sampleName)

failureCounter := sinkData[0].Counters[counterFailureName]
must.Eq(t, 1, failureCounter.Count)
must.Eq(t, 1, failureCounter.Sum)

sample2 := sinkData[0].Samples[sampleName]
must.Eq(t, 1, sample2.Count)
must.True(t, sample2.Sum > 0)
}

func TestNoOpHandler(t *testing.T) {
ci.Parallel(t)

newHandler := NewNoOpHandler()

// Create a new in-memory sink and global collector, so we can test that no
// metrics are emitted.
inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)

_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
must.NoError(t, err)

// Call the function with a non-nil error and check the results of the
// in-memory sink.
newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))

sinkData := inMemorySink.Data()
must.Len(t, 1, sinkData)
must.MapLen(t, 0, sinkData[0].Counters)
must.MapLen(t, 0, sinkData[0].Samples)
}
24 changes: 24 additions & 0 deletions client/allocrunner/interfaces/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package interfaces

import (
"time"

"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner/state"
"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
Expand Down Expand Up @@ -71,3 +73,25 @@ type HookResourceSetter interface {
SetCSIMounts(map[string]*csimanager.MountInfo)
GetCSIMounts(map[string]*csimanager.MountInfo)
}

// HookStatsHandler defines the interface used to emit metrics for the alloc
// and task runner hooks.
type HookStatsHandler interface {

// Emit is called once the hook has run, indicating the desired metrics
// should be emitted, if configured.
//
// Args:
// start: The time logged as the hook was triggered. This is used for the
// elapsed time metric.
//
// hookName: The name of the hook that was run, as returned typically by
// the Name() hook function.
//
// hookType: The type of hook such as "prerun" or "postrun".
//
// err: The error returned from the hook execution. The caller should not
// need to check whether this is nil or not before called this function.
//
Emit(start time.Time, hookName, hookType string, err error)
}
Loading

0 comments on commit 08cbe82

Please sign in to comment.