Skip to content

Commit

Permalink
metrics: Add policy_namespace label to per-policy memory metric
Browse files Browse the repository at this point in the history
This fixes a metrics collection error that was occurring in case there were
multiple namespaced policies with the same name. The metrics collector is
iterating over all policies and reports tetragon_tracingpolicy_kernel_memory_bytes
metric for each of them. The metric used to be labeled only with the policy
name, so in case there was a second policy with the same name, the collector
reported a duplicate what causes the entire metrics collection job to fail.
Adding a policy_namespace label should prevent duplicates.

There is an underlying assumption that each policy returned by
ListTracingPolicies has a unique (name, namespace) tuple - if it's not the
case, then metrics collection might fail again. Also, I'm not sure if
per-policy memory metrics should be reported for policies in states different
than loaded. Currently they are - I didn't change that behaviour, but it's now
covered by tests.

Fixes: a5301b8 ("pkg/metrics: add metrics for policy kernel memory us")

Signed-off-by: Anna Kapuscinska <anna@isovalent.com>
  • Loading branch information
lambdanis committed Oct 17, 2024
1 parent 17e6083 commit 80ee2a1
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 8 deletions.
3 changes: 2 additions & 1 deletion docs/content/en/docs/reference/metrics.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions pkg/metrics/policymetrics/policymetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ var policyState = metrics.MustNewCustomGauge(metrics.NewOpts(
var policyKernelMemory = metrics.MustNewCustomGauge(metrics.NewOpts(
consts.MetricsNamespace, "", "tracingpolicy_kernel_memory_bytes",
"The amount of kernel memory in bytes used by policy's sensors non-shared BPF maps (memlock).",
nil, nil, []metrics.UnconstrainedLabel{{Name: "policy", ExampleValue: "example-policy"}},
nil, nil, []metrics.UnconstrainedLabel{
{Name: "policy"},
{Name: "policy_namespace"},
},
))

// This metric collector converts the output of ListTracingPolicies into a few
Expand Down Expand Up @@ -70,7 +73,7 @@ func collect(ch chan<- prometheus.Metric) {
for _, policy := range list.Policies {
state := policy.State
counters[state]++
ch <- policyKernelMemory.MustMetric(float64(policy.KernelMemoryBytes), policy.Name)
ch <- policyKernelMemory.MustMetric(float64(policy.KernelMemoryBytes), policy.Name, policy.Namespace)
}

ch <- policyState.MustMetric(
Expand All @@ -95,5 +98,5 @@ func collectForDocs(ch chan<- prometheus.Metric) {
for _, state := range stateLabel.Values {
ch <- policyState.MustMetric(0, state)
}
ch <- policyKernelMemory.MustMetric(0, "example-policy")
ch <- policyKernelMemory.MustMetric(0, consts.ExamplePolicyLabel, consts.ExampleNamespace)
}
32 changes: 28 additions & 4 deletions pkg/metrics/policymetrics/policymetrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ func Test_policyStatusCollector_Collect(t *testing.T) {
expectedMetrics := func(disabled, enabled, err, load_error int) io.Reader {
return strings.NewReader(fmt.Sprintf(`# HELP tetragon_tracingpolicy_kernel_memory_bytes The amount of kernel memory in bytes used by policy's sensors non-shared BPF maps (memlock).
# TYPE tetragon_tracingpolicy_kernel_memory_bytes gauge
tetragon_tracingpolicy_kernel_memory_bytes{policy="pizza"} 0
tetragon_tracingpolicy_kernel_memory_bytes{policy="pizza", policy_namespace=""} 0
tetragon_tracingpolicy_kernel_memory_bytes{policy="amazing-one", policy_namespace=""} 0
tetragon_tracingpolicy_kernel_memory_bytes{policy="amazing-one", policy_namespace="default"} 0
tetragon_tracingpolicy_kernel_memory_bytes{policy="amazing-one", policy_namespace="kube-system"} 0
# HELP tetragon_tracingpolicy_loaded The number of loaded tracing policy by state.
# TYPE tetragon_tracingpolicy_loaded gauge
tetragon_tracingpolicy_loaded{state="disabled"} %d
Expand All @@ -39,7 +42,7 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d
// manager because in the observer tests we only initialize metrics while the observer
// changes for every test (see:
// https://github.com/cilium/tetragon/blob/22eb995b19207ac0ced2dd83950ec8e8aedd122d/pkg/observer/observertesthelper/observer_test_helper.go#L272-L276)
manager := tus.GetTestSensorManager(context.TODO(), t).Manager
manager := tus.GetTestSensorManagerWithDummyPF(context.TODO(), t).Manager
observer.SetSensorManager(manager)
t.Cleanup(observer.ResetSensorManager)

Expand All @@ -52,11 +55,32 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d
},
})
assert.NoError(t, err)
err = testutil.CollectAndCompare(collector, expectedMetrics(0, 1, 0, 0))
// Add three policies with the same name: one clusterwide, two namespaced
err = manager.AddTracingPolicy(context.TODO(), &tracingpolicy.GenericTracingPolicy{
Metadata: v1.ObjectMeta{
Name: "amazing-one",
},
})
assert.NoError(t, err)
err = manager.AddTracingPolicy(context.TODO(), &tracingpolicy.GenericTracingPolicyNamespaced{
Metadata: v1.ObjectMeta{
Name: "amazing-one",
Namespace: "default",
},
})
assert.NoError(t, err)
err = manager.AddTracingPolicy(context.TODO(), &tracingpolicy.GenericTracingPolicyNamespaced{
Metadata: v1.ObjectMeta{
Name: "amazing-one",
Namespace: "kube-system",
},
})
assert.NoError(t, err)
err = testutil.CollectAndCompare(collector, expectedMetrics(0, 4, 0, 0))
assert.NoError(t, err)

err = manager.DisableTracingPolicy(context.TODO(), "pizza", "")
assert.NoError(t, err)
err = testutil.CollectAndCompare(collector, expectedMetrics(1, 0, 0, 0))
err = testutil.CollectAndCompare(collector, expectedMetrics(1, 3, 0, 0))
assert.NoError(t, err)
}

0 comments on commit 80ee2a1

Please sign in to comment.