diff --git a/.changelog/24556.txt b/.changelog/24556.txt new file mode 100644 index 00000000000..79bb35bf406 --- /dev/null +++ b/.changelog/24556.txt @@ -0,0 +1,3 @@ +```release-note:improvement +client: Emit telemetry from prerun and prestart hooks for monitoring and alerting +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 52467f45656..9b395984545 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -10,9 +10,11 @@ import ( "sync" "time" + "github.com/armon/go-metrics" log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/allocrunner/state" "github.com/hashicorp/nomad/client/allocrunner/tasklifecycle" @@ -44,6 +46,7 @@ import ( // allocRunner is used to run all the tasks in a given allocation type allocRunner struct { + // id is the ID of the allocation. Can be accessed without a lock id string @@ -53,6 +56,15 @@ type allocRunner struct { // clientConfig is the client configuration block. clientConfig *config.Config + // clientBaseLabels are the base metric labels generated by the client. + // These can be used by processes which emit metrics that want to include + // these labels that include node_id, node_class, and node_pool. + // + // They could be generated using the clientConfig, but the kv pairs will + // not change unless the client process is fully restarted, so passing them + // along avoids extra work. + clientBaseLabels []metrics.Label + // stateUpdater is used to emit updated alloc state stateUpdater cinterfaces.AllocStateHandler @@ -87,6 +99,10 @@ type allocRunner struct { // vaultClientFunc is used to get the client used to manage Vault tokens vaultClientFunc vaultclient.VaultClientFunc + // hookStatsHandler is used by certain hooks to emit telemetry data, if the + // operator has not disabled this functionality. + hookStatsHandler interfaces.HookStatsHandler + // waitCh is closed when the Run loop has exited waitCh chan struct{} @@ -230,6 +246,7 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e id: alloc.ID, alloc: alloc, clientConfig: config.ClientConfig, + clientBaseLabels: config.BaseLabels, consulServicesHandler: config.ConsulServices, consulProxiesClientFunc: config.ConsulProxiesFunc, sidsClient: config.ConsulSI, @@ -269,6 +286,8 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e // Create alloc broadcaster ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger) + ar.setHookStatsHandler(alloc.Namespace) + // Create alloc dir ar.allocDir = allocdir.NewAllocDir( ar.logger, @@ -315,6 +334,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error { trConfig := &taskrunner.Config{ Alloc: ar.alloc, ClientConfig: ar.clientConfig, + ClientBaseLabels: ar.clientBaseLabels, Task: task, TaskDir: ar.allocDir.NewTaskDir(task), Logger: ar.logger, @@ -1549,3 +1569,23 @@ func (ar *allocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error) } return allocVols.CSIVolumes, nil } + +// setHookStatsHandler builds the hook stats handler based on whether the +// operator has disabled this or not. +// +// The non-noop implementation is built using the base client labels and +// the allocation namespace. The caller will add "hook_name". It would be +// possible to add more labels, however, this would cause the metric +// cardinality to increase dramatically without much benefit. +// +// This could be inline within the NewAllocRunner function, but having a +// separate function makes testing easier. +func (ar *allocRunner) setHookStatsHandler(ns string) { + if ar.clientConfig.DisableAllocationHookMetrics { + ar.hookStatsHandler = hookstats.NewNoOpHandler() + } else { + labels := ar.clientBaseLabels + labels = append(labels, metrics.Label{Name: "namespace", Value: ns}) + ar.hookStatsHandler = hookstats.NewHandler(labels, "alloc_hook") + } +} diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go index 21e7aef053f..be8e082d224 100644 --- a/client/allocrunner/alloc_runner_hooks.go +++ b/client/allocrunner/alloc_runner_hooks.go @@ -189,7 +189,17 @@ func (ar *allocRunner) prerun() error { ar.logger.Trace("running pre-run hook", "name", name, "start", start) } - if err := pre.Prerun(); err != nil { + // If the operator has disabled hook metrics, then don't call the time + // function to save 30ns per hook. + var hookExecutionStart time.Time + + if !ar.clientConfig.DisableAllocationHookMetrics { + hookExecutionStart = time.Now() + } + + err := pre.Prerun() + ar.hookStatsHandler.Emit(hookExecutionStart, name, "prerun", err) + if err != nil { return fmt.Errorf("pre-run hook %q failed: %v", name, err) } diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index 71cad1c8e6b..591aa596ec5 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -12,15 +12,18 @@ import ( "testing" "time" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/api" multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allochealth" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" arstate "github.com/hashicorp/nomad/client/allocrunner/state" "github.com/hashicorp/nomad/client/allocrunner/tasklifecycle" "github.com/hashicorp/nomad/client/allocrunner/taskrunner" "github.com/hashicorp/nomad/client/allocwatcher" + client "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/lib/proclib" "github.com/hashicorp/nomad/client/serviceregistration" regMock "github.com/hashicorp/nomad/client/serviceregistration/mock" @@ -2531,3 +2534,31 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) { calloc = ar.clientAlloc(map[string]*structs.TaskState{}) must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc)) } + +func TestAllocRunner_setHookStatsHandler(t *testing.T) { + ci.Parallel(t) + + // Create an allocation runner that doesn't have any configuration, which + // means the operator has not disabled hook metrics. + baseAllocRunner := &allocRunner{ + clientConfig: &client.Config{}, + clientBaseLabels: []metrics.Label{}, + } + + baseAllocRunner.setHookStatsHandler("platform") + handler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.Handler) + must.True(t, ok) + must.NotNil(t, handler) + + // Create a new allocation runner but explicitly disable hook metrics + // collection. + baseAllocRunner = &allocRunner{ + clientConfig: &client.Config{DisableAllocationHookMetrics: true}, + clientBaseLabels: []metrics.Label{}, + } + + baseAllocRunner.setHookStatsHandler("platform") + noopHandler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.NoOpHandler) + must.True(t, ok) + must.NotNil(t, noopHandler) +} diff --git a/client/allocrunner/hookstats/hookstats.go b/client/allocrunner/hookstats/hookstats.go new file mode 100644 index 00000000000..583c441419c --- /dev/null +++ b/client/allocrunner/hookstats/hookstats.go @@ -0,0 +1,53 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package hookstats + +import ( + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" +) + +// Handler implements interfaces.HookStatsHandler and is used when the operator +// has not disabled hook metrics. +type Handler struct { + baseLabels []metrics.Label + runnerType string +} + +// NewHandler creates a new hook stats handler to be used for emitting hook +// stats for operator alerting and performance identification. The base labels +// should be passed from the client set of labels and the runner type indicates +// if the hooks are run from the alloc or task runner. +func NewHandler(base []metrics.Label, runnerType string) interfaces.HookStatsHandler { + return &Handler{ + baseLabels: base, + runnerType: runnerType, + } +} + +func (h *Handler) Emit(start time.Time, hookName, hookType string, err error) { + + // Add the hook name to the base labels array, so we have a complete set to + // add to the metrics. Operators do not want this as part of the metric + // name due to cardinality control. + labels := h.baseLabels + labels = append(labels, metrics.Label{Name: "hook_name", Value: hookName}) + + metrics.MeasureSinceWithLabels([]string{"client", h.runnerType, hookType, "elapsed"}, start, labels) + if err != nil { + metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "failed"}, 1, labels) + } else { + metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "success"}, 1, labels) + } +} + +// NoOpHandler implements interfaces.HookStatsHandler and is used when the +// operator has disabled hook metrics. +type NoOpHandler struct{} + +func NewNoOpHandler() interfaces.HookStatsHandler { return &NoOpHandler{} } + +func (n *NoOpHandler) Emit(_ time.Time, _, _ string, _ error) {} diff --git a/client/allocrunner/hookstats/hookstats_test.go b/client/allocrunner/hookstats/hookstats_test.go new file mode 100644 index 00000000000..ecc4d3f5794 --- /dev/null +++ b/client/allocrunner/hookstats/hookstats_test.go @@ -0,0 +1,113 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package hookstats + +import ( + "errors" + "testing" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/nomad/ci" + "github.com/shoenig/test/must" +) + +func TestHandler(t *testing.T) { + ci.Parallel(t) + + // Generate base labels that represent what an operator would see and then + // create out new handler to interact with. + baseLabels := []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + {Name: "node_class", Value: "none"}, + {Name: "node_pool", Value: "default"}, + {Name: "namespace", Value: "default"}, + {Name: "host", Value: "client-5d3c"}, + {Name: "node_id", Value: "35db24e7-0a55-80d2-2279-e022c37cc591"}, + } + newHandler := NewHandler(baseLabels, "test_hook") + + // The data stored is within the in-memory sink as map entries, so we need + // to know the key names to pull this out correctly. Build those now. + var metricKeySuffix, sampleName, counterSuccessName, counterFailureName string + + for _, label := range baseLabels { + metricKeySuffix += ";" + label.Name + "=" + label.Value + } + + metricKeySuffix += ";" + "hook_name=test_hook_name" + sampleName = "nomad_test.client.test_hook.prerun.elapsed" + metricKeySuffix + counterSuccessName = "nomad_test.client.test_hook.prerun.success" + metricKeySuffix + counterFailureName = "nomad_test.client.test_hook.prerun.failed" + metricKeySuffix + + // Create an in-memory sink and global, so we can actually look at and test + // the metrics we emit. + inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) + + _, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) + must.NoError(t, err) + + // Emit hook related metrics where the supplied error is nil and check that + // the data is as expected. + newHandler.Emit(time.Now(), "test_hook_name", "prerun", nil) + + sinkData := inMemorySink.Data() + must.Len(t, 1, sinkData) + must.MapContainsKey(t, sinkData[0].Counters, counterSuccessName) + must.MapContainsKey(t, sinkData[0].Samples, sampleName) + + successCounter := sinkData[0].Counters[counterSuccessName] + must.Eq(t, 1, successCounter.Count) + must.Eq(t, 1, successCounter.Sum) + + sample1 := sinkData[0].Samples[sampleName] + must.Eq(t, 1, sample1.Count) + must.True(t, sample1.Sum > 0) + + // Create a new in-memory sink and global collector to ensure we don't have + // leftovers from the previous test. + inMemorySink = metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) + + _, err = metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) + must.NoError(t, err) + + // Emit a hook related metrics where the supplied error is non-nil and + // check that the data is as expected. + newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error")) + + sinkData = inMemorySink.Data() + must.Len(t, 1, sinkData) + must.MapContainsKey(t, sinkData[0].Counters, counterFailureName) + must.MapContainsKey(t, sinkData[0].Samples, sampleName) + + failureCounter := sinkData[0].Counters[counterFailureName] + must.Eq(t, 1, failureCounter.Count) + must.Eq(t, 1, failureCounter.Sum) + + sample2 := sinkData[0].Samples[sampleName] + must.Eq(t, 1, sample2.Count) + must.True(t, sample2.Sum > 0) +} + +func TestNoOpHandler(t *testing.T) { + ci.Parallel(t) + + newHandler := NewNoOpHandler() + + // Create a new in-memory sink and global collector, so we can test that no + // metrics are emitted. + inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) + + _, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) + must.NoError(t, err) + + // Call the function with a non-nil error and check the results of the + // in-memory sink. + newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error")) + + sinkData := inMemorySink.Data() + must.Len(t, 1, sinkData) + must.MapLen(t, 0, sinkData[0].Counters) + must.MapLen(t, 0, sinkData[0].Samples) +} diff --git a/client/allocrunner/interfaces/runner.go b/client/allocrunner/interfaces/runner.go index aabebb7959b..0ff21ea7b2d 100644 --- a/client/allocrunner/interfaces/runner.go +++ b/client/allocrunner/interfaces/runner.go @@ -4,6 +4,8 @@ package interfaces import ( + "time" + "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/allocrunner/state" "github.com/hashicorp/nomad/client/pluginmanager/csimanager" @@ -71,3 +73,25 @@ type HookResourceSetter interface { SetCSIMounts(map[string]*csimanager.MountInfo) GetCSIMounts(map[string]*csimanager.MountInfo) } + +// HookStatsHandler defines the interface used to emit metrics for the alloc +// and task runner hooks. +type HookStatsHandler interface { + + // Emit is called once the hook has run, indicating the desired metrics + // should be emitted, if configured. + // + // Args: + // start: The time logged as the hook was triggered. This is used for the + // elapsed time metric. + // + // hookName: The name of the hook that was run, as returned typically by + // the Name() hook function. + // + // hookType: The type of hook such as "prerun" or "postrun". + // + // err: The error returned from the hook execution. The caller should not + // need to check whether this is nil or not before called this function. + // + Emit(start time.Time, hookName, hookType string, err error) +} diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 5d74ab03937..b93b75e3d83 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -17,6 +17,7 @@ import ( multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/hcl/v2/hcldec" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state" @@ -213,6 +214,14 @@ type TaskRunner struct { // will have these tags, and optionally more. baseLabels []metrics.Label + // clientBaseLabels are the base metric labels generated by the client. + // These can be used by processes which emit metrics that want to include + // these labels that include node_id, node_class, and node_pool. + // + // These are different to the baseLabels and provide a higher-level view of + // the client functionality when used. + clientBaseLabels []metrics.Label + // logmonHookConfig is used to get the paths to the stdout and stderr fifos // to be passed to the driver for task logging logmonHookConfig *logmonHookConfig @@ -281,6 +290,10 @@ type TaskRunner struct { // users manages the pool of dynamic workload users users dynamic.Pool + // hookStatsHandler is used by certain hooks to emit telemetry data, if the + // operator has not disabled this functionality. + hookStatsHandler interfaces.HookStatsHandler + // pauser controls whether the task should be run or stopped based on a // schedule. (Enterprise) pauser *pauseGate @@ -293,6 +306,9 @@ type Config struct { TaskDir *allocdir.TaskDir Logger log.Logger + // ClientBaseLabels are the base metric labels generated by the client. + ClientBaseLabels []metrics.Label + // ConsulServices is used for managing Consul service registrations ConsulServices serviceregistration.Handler @@ -389,6 +405,7 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) { alloc: config.Alloc, allocID: config.Alloc.ID, clientConfig: config.ClientConfig, + clientBaseLabels: config.ClientBaseLabels, task: config.Task, taskDir: config.TaskDir, taskName: config.Task.Name, @@ -433,6 +450,8 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) { // Create the pauser tr.pauser = newPauseGate(tr) + tr.setHookStatsHandler(config.Alloc.Namespace) + // Pull out the task's resources ares := tr.alloc.AllocatedResources if ares == nil { @@ -1684,3 +1703,23 @@ func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) { func (tr *TaskRunner) shutdownDelayCancel() { tr.shutdownDelayCancelFn() } + +// setHookStatsHandler builds the hook stats handler based on whether the +// operator has disabled this or not. +// +// The non-noop implementation is built using the base client labels and +// the allocation namespace. The caller will add "hook_name". It would be +// possible to add more labels, however, this would cause the metric +// cardinality to increase dramatically without much benefit. +// +// This could be inline within the NewTaskRunner function, but having a +// separate function makes testing easier. +func (tr *TaskRunner) setHookStatsHandler(ns string) { + if tr.clientConfig.DisableAllocationHookMetrics { + tr.hookStatsHandler = hookstats.NewNoOpHandler() + } else { + labels := tr.clientBaseLabels + labels = append(labels, metrics.Label{Name: "namespace", Value: ns}) + tr.hookStatsHandler = hookstats.NewHandler(labels, "task_hook") + } +} diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index b0fdcea1a4d..86b1cebd41a 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -282,9 +282,19 @@ func (tr *TaskRunner) prestart() error { tr.logger.Trace("running prestart hook", "name", name, "start", start) } + // If the operator has disabled hook metrics, then don't call the time + // function to save 30ns per hook. + var hookExecutionStart time.Time + + if !tr.clientConfig.DisableAllocationHookMetrics { + hookExecutionStart = time.Now() + } + // Run the prestart hook var resp interfaces.TaskPrestartResponse - if err := pre.Prestart(joinedCtx, &req, &resp); err != nil { + err := pre.Prestart(joinedCtx, &req, &resp) + tr.hookStatsHandler.Emit(hookExecutionStart, name, "prestart", err) + if err != nil { tr.emitHookError(err, name) return structs.WrapRecoverable(fmt.Sprintf("prestart hook %q failed: %v", name, err), err) } diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go index ee29756519b..3430e3a3e80 100644 --- a/client/allocrunner/taskrunner/task_runner_test.go +++ b/client/allocrunner/taskrunner/task_runner_test.go @@ -18,10 +18,12 @@ import ( "testing" "time" + "github.com/armon/go-metrics" "github.com/golang/snappy" consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/getter" "github.com/hashicorp/nomad/client/config" @@ -3099,3 +3101,31 @@ func TestTaskRunner_AllocNetworkStatus(t *testing.T) { }) } } + +func TestTaskRunner_setHookStatsHandler(t *testing.T) { + ci.Parallel(t) + + // Create an task runner that doesn't have any configuration, which means + // the operator has not disabled hook metrics. + baseTaskRunner := &TaskRunner{ + clientConfig: &config.Config{}, + clientBaseLabels: []metrics.Label{}, + } + + baseTaskRunner.setHookStatsHandler("platform") + handler, ok := baseTaskRunner.hookStatsHandler.(*hookstats.Handler) + must.True(t, ok) + must.NotNil(t, handler) + + // Create a new allocation runner but explicitly disable hook metrics + // collection. + baseTaskRunner = &TaskRunner{ + clientConfig: &config.Config{DisableAllocationHookMetrics: true}, + clientBaseLabels: []metrics.Label{}, + } + + baseTaskRunner.setHookStatsHandler("platform") + noopHandler, ok := baseTaskRunner.hookStatsHandler.(*hookstats.NoOpHandler) + must.True(t, ok) + must.NotNil(t, noopHandler) +} diff --git a/client/client.go b/client/client.go index 8d3a0805312..fa539f51ed2 100644 --- a/client/client.go +++ b/client/client.go @@ -621,6 +621,10 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie // Begin syncing allocations to the server c.shutdownGroup.Go(c.allocSync) + // Ensure our base labels are generated and stored before we start the + // client and begin emitting stats. + c.setupStatsLabels() + // Start the client! Don't use the shutdownGroup as run handles // shutdowns manually to prevent updates from being applied during // shutdown. @@ -2783,6 +2787,7 @@ func (c *Client) newAllocRunnerConfig( ) *config.AllocRunnerConfig { return &config.AllocRunnerConfig{ Alloc: alloc, + BaseLabels: c.baseLabels, CSIManager: c.csimanager, CheckStore: c.checkStore, ClientConfig: c.GetConfig(), @@ -3183,22 +3188,33 @@ DISCOLOOP: return nil } -// emitStats collects host resource usage stats periodically -func (c *Client) emitStats() { - // Determining NodeClass to be emitted +// setupStatsLabels builds the base labels for the client. This should be done +// before Client.run() is called, so that sub-system can use these without fear +// they have not been set. +func (c *Client) setupStatsLabels() { + + // Determine the node class label value to emit, so we do not emit an empty + // string if this parameter has not been set by operators. var emittedNodeClass string if emittedNodeClass = c.Node().NodeClass; emittedNodeClass == "" { emittedNodeClass = "none" } - // Assign labels directly before emitting stats so the information expected - // is ready + // Store the base labels, so client and allocrunner subsystem can access + // and use these. + // + // The four labels provide enough useful information for operators in both + // single and multi-tenant clusters while not exploding the cardinality. c.baseLabels = []metrics.Label{ {Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}, {Name: "node_class", Value: emittedNodeClass}, {Name: "node_pool", Value: c.Node().NodePool}, } +} + +// emitStats collects host resource usage stats periodically +func (c *Client) emitStats() { // Start collecting host stats right away and then keep collecting every // collection interval diff --git a/client/client_test.go b/client/client_test.go index 92914004208..40a58033233 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -14,6 +14,7 @@ import ( "testing" "time" + "github.com/armon/go-metrics" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocrunner" @@ -91,30 +92,56 @@ func TestClient_alloc_dirs(t *testing.T) { must.Eq(t, 0o711|fs.ModeDir, fi.Mode()) } -// Certain labels for metrics are dependant on client initial setup. This tests -// that the client has properly initialized before we assign values to labels -func TestClient_BaseLabels(t *testing.T) { +func TestClient_setupStatsLabels(t *testing.T) { ci.Parallel(t) - assert := assert.New(t) - client, cleanup := TestClient(t, nil) - if err := client.Shutdown(); err != nil { - t.Fatalf("err: %v", err) + testCases := []struct { + name string + inputConfig *config.Config + expectedLabels []metrics.Label + }{ + { + name: "empty node class", + inputConfig: &config.Config{ + Node: &structs.Node{ + ID: "f57156f9-19c6-4954-a96e-5abb0b47a8b2", + Datacenter: "dc1", + NodeClass: "", + NodePool: "default", + }, + }, + expectedLabels: []metrics.Label{ + {Name: "node_id", Value: "f57156f9-19c6-4954-a96e-5abb0b47a8b2"}, + {Name: "datacenter", Value: "dc1"}, + {Name: "node_class", Value: "none"}, + {Name: "node_pool", Value: "default"}, + }, + }, + { + name: "non-empty node class", + inputConfig: &config.Config{ + Node: &structs.Node{ + ID: "f57156f9-19c6-4954-a96e-5abb0b47a8b2", + Datacenter: "dc1", + NodeClass: "high-memory", + NodePool: "default", + }, + }, + expectedLabels: []metrics.Label{ + {Name: "node_id", Value: "f57156f9-19c6-4954-a96e-5abb0b47a8b2"}, + {Name: "datacenter", Value: "dc1"}, + {Name: "node_class", Value: "high-memory"}, + {Name: "node_pool", Value: "default"}, + }, + }, } - defer cleanup() - // directly invoke this function, as otherwise this will fail on a CI build - // due to a race condition - client.emitStats() - - baseLabels := client.baseLabels - assert.NotEqual(0, len(baseLabels)) - - nodeID := client.Node().ID - for _, e := range baseLabels { - if e.Name == "node_id" { - assert.Equal(nodeID, e.Value) - } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + lightClient := &Client{config: tc.inputConfig} + lightClient.setupStatsLabels() + must.SliceContainsAll(t, lightClient.baseLabels, tc.expectedLabels) + }) } } diff --git a/client/config/arconfig.go b/client/config/arconfig.go index 3ce09f0c997..e2e7f16e7e7 100644 --- a/client/config/arconfig.go +++ b/client/config/arconfig.go @@ -6,6 +6,7 @@ package config import ( "context" + "github.com/armon/go-metrics" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocdir" arinterfaces "github.com/hashicorp/nomad/client/allocrunner/interfaces" @@ -47,6 +48,11 @@ type AllocRunnerConfig struct { // Alloc captures the allocation that should be run. Alloc *structs.Allocation + // BaseLabels are the base metric labels generated by the client. These can + // be used by processes which emit metrics that want to include these + // labels that include node_id, node_class, and node_pool. + BaseLabels []metrics.Label + // StateDB is used to store and restore state. StateDB cstate.StateDB diff --git a/client/config/config.go b/client/config/config.go index 4571cc4da12..c6b18bcd08f 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -207,6 +207,10 @@ type Config struct { // allocation metadata as labels in the metrics to remote Telemetry sinks IncludeAllocMetadataInMetrics bool + // DisableAllocationHookMetrics allows operators to disable emitting hook + // metrics. + DisableAllocationHookMetrics bool + // AllowedMetadataKeysInMetrics when provided nomad will only include the // configured metadata keys as part of the metrics to remote Telemetry sinks AllowedMetadataKeysInMetrics []string diff --git a/command/agent/agent.go b/command/agent/agent.go index 48aa503c904..6095e2dce99 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -863,6 +863,7 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics conf.IncludeAllocMetadataInMetrics = agentConfig.Telemetry.IncludeAllocMetadataInMetrics conf.AllowedMetadataKeysInMetrics = agentConfig.Telemetry.AllowedMetadataKeysInMetrics + conf.DisableAllocationHookMetrics = *agentConfig.Telemetry.DisableAllocationHookMetrics // Set the TLS related configs conf.TLSConfig = agentConfig.TLSConfig diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 603cf04f1ad..b53fd2b9634 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -740,6 +740,24 @@ func TestConvertClientConfig(t *testing.T) { }, expectErr: "invalid bridge_network_subnet_ipv6: not an IPv6 address: 10.0.0.1/24", }, + { + name: "hook metrics enabled (default value)", + modConfig: func(c *Config) { + c.Telemetry.DisableAllocationHookMetrics = pointer.Of(false) + }, + assert: func(t *testing.T, cc *clientconfig.Config) { + must.False(t, cc.DisableAllocationHookMetrics) + }, + }, + { + name: "hook metrics disabled", + modConfig: func(c *Config) { + c.Telemetry.DisableAllocationHookMetrics = pointer.Of(true) + }, + assert: func(t *testing.T, cc *clientconfig.Config) { + must.True(t, cc.DisableAllocationHookMetrics) + }, + }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/command/agent/config.go b/command/agent/config.go index 2c8aa514ce0..4f8e41f02c8 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -1006,6 +1006,10 @@ type Telemetry struct { // rate is well-controlled but cardinality of requesters is high. DisableRPCRateMetricsLabels bool `hcl:"disable_rpc_rate_metrics_labels"` + // DisableAllocationHookMetrics allows operators to disable emitting hook + // metrics. + DisableAllocationHookMetrics *bool `hcl:"disable_allocation_hook_metrics"` + // Circonus: see https://github.com/circonus-labs/circonus-gometrics // for more details on the various configuration options. // Valid configuration combinations: @@ -1450,12 +1454,13 @@ func DefaultConfig() *Config { }, SyslogFacility: "LOCAL0", Telemetry: &Telemetry{ - InMemoryCollectionInterval: "10s", - inMemoryCollectionInterval: 10 * time.Second, - InMemoryRetentionPeriod: "1m", - inMemoryRetentionPeriod: 1 * time.Minute, - CollectionInterval: "1s", - collectionInterval: 1 * time.Second, + InMemoryCollectionInterval: "10s", + inMemoryCollectionInterval: 10 * time.Second, + InMemoryRetentionPeriod: "1m", + inMemoryRetentionPeriod: 1 * time.Minute, + CollectionInterval: "1s", + collectionInterval: 1 * time.Second, + DisableAllocationHookMetrics: pointer.Of(false), }, TLSConfig: &config.TLSConfig{}, Sentinel: &config.SentinelConfig{}, @@ -2589,6 +2594,9 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry { if b.DisableRPCRateMetricsLabels { result.DisableRPCRateMetricsLabels = b.DisableRPCRateMetricsLabels } + if b.DisableAllocationHookMetrics != nil { + result.DisableAllocationHookMetrics = b.DisableAllocationHookMetrics + } return &result } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 34904ffa89c..c22f74cb15c 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -198,19 +198,20 @@ var basicConfig = &Config{ }, }, Telemetry: &Telemetry{ - StatsiteAddr: "127.0.0.1:1234", - StatsdAddr: "127.0.0.1:2345", - PrometheusMetrics: true, - DisableHostname: true, - UseNodeName: false, - InMemoryCollectionInterval: "1m", - inMemoryCollectionInterval: 1 * time.Minute, - InMemoryRetentionPeriod: "24h", - inMemoryRetentionPeriod: 24 * time.Hour, - CollectionInterval: "3s", - collectionInterval: 3 * time.Second, - PublishAllocationMetrics: true, - PublishNodeMetrics: true, + DisableAllocationHookMetrics: pointer.Of(true), + StatsiteAddr: "127.0.0.1:1234", + StatsdAddr: "127.0.0.1:2345", + PrometheusMetrics: true, + DisableHostname: true, + UseNodeName: false, + InMemoryCollectionInterval: "1m", + inMemoryCollectionInterval: 1 * time.Minute, + InMemoryRetentionPeriod: "24h", + inMemoryRetentionPeriod: 24 * time.Hour, + CollectionInterval: "3s", + collectionInterval: 3 * time.Second, + PublishAllocationMetrics: true, + PublishNodeMetrics: true, }, LeaveOnInt: true, LeaveOnTerm: true, @@ -1139,12 +1140,14 @@ func TestConfig_Telemetry(t *testing.T) { // Ensure we can then overlay user specified data. inputTelemetry2 := &Telemetry{ - inMemoryCollectionInterval: 1 * time.Second, - inMemoryRetentionPeriod: 10 * time.Second, + inMemoryCollectionInterval: 1 * time.Second, + inMemoryRetentionPeriod: 10 * time.Second, + DisableAllocationHookMetrics: pointer.Of(true), } mergedTelemetry2 := mergedTelemetry1.Merge(inputTelemetry2) must.Eq(t, mergedTelemetry2.inMemoryCollectionInterval, 1*time.Second) must.Eq(t, mergedTelemetry2.inMemoryRetentionPeriod, 10*time.Second) + must.True(t, *mergedTelemetry2.DisableAllocationHookMetrics) } func TestConfig_Template(t *testing.T) { diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 4eac3eacf8d..ebc880656a9 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -73,6 +73,7 @@ func TestConfig_Merge(t *testing.T) { DataDogTags: []string{"cat1:tag1", "cat2:tag2"}, PrometheusMetrics: true, DisableHostname: false, + DisableAllocationHookMetrics: pointer.Of(false), CirconusAPIToken: "0", CirconusAPIApp: "nomadic", CirconusAPIURL: "http://api.circonus.com/v2", @@ -279,6 +280,7 @@ func TestConfig_Merge(t *testing.T) { DataDogTags: []string{"cat1:tag1", "cat2:tag2"}, PrometheusMetrics: true, DisableHostname: true, + DisableAllocationHookMetrics: pointer.Of(true), PublishNodeMetrics: true, PublishAllocationMetrics: true, CirconusAPIToken: "1", diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index c8c0e0a3873..5934b971abd 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -201,15 +201,16 @@ audit { } telemetry { - in_memory_collection_interval = "1m" - in_memory_retention_period = "24h" - statsite_address = "127.0.0.1:1234" - statsd_address = "127.0.0.1:2345" - prometheus_metrics = true - disable_hostname = true - collection_interval = "3s" - publish_allocation_metrics = true - publish_node_metrics = true + disable_allocation_hook_metrics = true + in_memory_collection_interval = "1m" + in_memory_retention_period = "24h" + statsite_address = "127.0.0.1:1234" + statsd_address = "127.0.0.1:2345" + prometheus_metrics = true + disable_hostname = true + collection_interval = "3s" + publish_allocation_metrics = true + publish_node_metrics = true } leave_on_interrupt = true diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index c5f4702b6a0..f78faf21595 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -369,6 +369,7 @@ "syslog_facility": "LOCAL1", "telemetry": [ { + "disable_allocation_hook_metrics": true, "in_memory_collection_interval": "1m", "in_memory_retention_period": "24h", "collection_interval": "3s", diff --git a/website/content/docs/configuration/telemetry.mdx b/website/content/docs/configuration/telemetry.mdx index 2899580b6ee..65bffc5b080 100644 --- a/website/content/docs/configuration/telemetry.mdx +++ b/website/content/docs/configuration/telemetry.mdx @@ -97,6 +97,9 @@ The following options are available on all telemetry configurations. against Nomad's state store, users with many namespaces and many quotas may want to disable these metrics. +- `disable_allocation_hook_metrics` `(bool: false)` - Specifies if the Nomad + client should publish metrics related to allocation and task hooks. + ### `statsite` These `telemetry` parameters apply to diff --git a/website/content/docs/operations/metrics-reference.mdx b/website/content/docs/operations/metrics-reference.mdx index 4ab64495a73..1b769d83b46 100644 --- a/website/content/docs/operations/metrics-reference.mdx +++ b/website/content/docs/operations/metrics-reference.mdx @@ -188,6 +188,20 @@ Nomad will emit [tagged metrics][tagged-metrics], in the below format: | `nomad.client.unallocated.memory` | Total amount of memory free for the scheduler to allocate to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | | `nomad.client.uptime` | Uptime of the host running the Nomad client | Seconds | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +### Client Hook Metrics + +Nomad will emit metrics allowing you to monitor and alert on allocation and task hook performance. +If you do not need these, they can be disabled via the [`disable_allocation_hook_metrics`][] +configuration parameter. + +| Metric | Description | Unit | Type | Labels | +|-------------------------------------------|-------------------------------------------------------|--------------|---------|-------------------------------------------------------------| +| `nomad.client.alloc_hook.prerun.failed` | Number of hook executions that resulted in an error | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.alloc_hook.prerun.success` | Number of hook executions that completed successfully | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.alloc_hook.prerun.elapsed` | The time it took the hook to run | Milliseconds | Timer | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.task_hook.prestart.failed` | Number of hook executions that resulted in an error | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.task_hook.prestart.success` | Number of hook executions that completed successfully | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.task_hook.prestart.elapsed` | The time it took the hook to run | Milliseconds | Timer | datacenter, host, node_class, node_id, node_pool, hook_name | ## Allocation Metrics @@ -521,3 +535,4 @@ Agent metrics are emitted by all Nomad agents running in either client or server [tagged-metrics]: /nomad/docs/operations/metrics-reference#tagged-metrics [sticky]: /nomad/docs/job-specification/ephemeral_disk#sticky [s_port_plan_failure]: https://developer.hashicorp.com/nomad/s/port-plan-failure +[`disable_allocation_hook_metrics`]: /nomad/docs/configuration/telemetry#disable_allocation_hook_metrics