From 52368c722d93029fbce4c8f25a4f677c3a555cee Mon Sep 17 00:00:00 2001 From: Justin Cichra Date: Wed, 24 Apr 2024 15:48:59 +0000 Subject: [PATCH] feat: add metrics for total_active_file and total_inactive_file memory The goal of this PR is to have additional cAdvisor metrics which expose total_active_file and total_inactive_file. Today working_set_bytes subtracts total_inactive_file in its calculation, but there are situations where exposing these metrics directly is valuable. For example, two containers sharing files in an emptyDir increases total_active_file over time. This is not tracked in the working_set memory. Exposing total_active_file and total_inactive_file to the user allows them to subtract out total_active_file or total_inactive_file if they so choose in their alerts. In the case of prometheus with a thanos sidecar, working_set can give a false sense of high memory usage. The kernel counts thanos reading prometheus written files as "active_file" memory. In that situation, a user may want to exclude active_file from their ContainerLowOnMemory alert. Relates to: kubernetes/kubernetes#43916 --- cmd/internal/storage/bigquery/bigquery.go | 20 ++++++++++++ cmd/internal/storage/influxdb/influxdb.go | 8 +++++ .../storage/influxdb/influxdb_test.go | 32 +++++++++++++------ cmd/internal/storage/statsd/statsd.go | 8 +++++ cmd/internal/storage/stdout/stdout.go | 8 +++++ container/libcontainer/handler.go | 10 ++++++ info/v1/container.go | 8 +++++ info/v2/conversion_test.go | 12 ++++--- integration/tests/api/test_utils.go | 2 ++ metrics/prometheus.go | 16 ++++++++++ metrics/prometheus_fake.go | 8 +++-- metrics/testdata/prometheus_metrics | 6 ++++ .../prometheus_metrics_whitelist_filtered | 6 ++++ 13 files changed, 126 insertions(+), 18 deletions(-) diff --git a/cmd/internal/storage/bigquery/bigquery.go b/cmd/internal/storage/bigquery/bigquery.go index fa5147ab37..6e58f3415d 100644 --- a/cmd/internal/storage/bigquery/bigquery.go +++ b/cmd/internal/storage/bigquery/bigquery.go @@ -50,6 +50,10 @@ const ( colMemoryUsage string = "memory_usage" // Working set size colMemoryWorkingSet string = "memory_working_set" + // Total active file size + colMemoryTotalActiveFile string = "memory_total_active_file" + // Total inactive file size + colMemoryTotalInactiveFile string = "memory_total_inactive_file" // Container page fault colMemoryContainerPgfault string = "memory_container_pgfault" // Constainer major page fault @@ -133,6 +137,16 @@ func (s *bigqueryStorage) GetSchema() *bigquery.TableSchema { Name: colMemoryWorkingSet, } i++ + fields[i] = &bigquery.TableFieldSchema{ + Type: typeInteger, + Name: colMemoryTotalActiveFile, + } + i++ + fields[i] = &bigquery.TableFieldSchema{ + Type: typeInteger, + Name: colMemoryTotalInactiveFile, + } + i++ fields[i] = &bigquery.TableFieldSchema{ Type: typeInteger, Name: colMemoryContainerPgfault, @@ -226,6 +240,12 @@ func (s *bigqueryStorage) containerStatsToRows( // Working set size row[colMemoryWorkingSet] = stats.Memory.WorkingSet + // Total active file size + row[colMemoryTotalActiveFile] = stats.Memory.TotalActiveFile + + // Total inactive file size + row[colMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile + // container page fault row[colMemoryContainerPgfault] = stats.Memory.ContainerData.Pgfault diff --git a/cmd/internal/storage/influxdb/influxdb.go b/cmd/internal/storage/influxdb/influxdb.go index 93bb6b644f..a651e54a85 100644 --- a/cmd/internal/storage/influxdb/influxdb.go +++ b/cmd/internal/storage/influxdb/influxdb.go @@ -70,6 +70,10 @@ const ( serMemoryMappedFile string = "memory_mapped_file" // Working set size serMemoryWorkingSet string = "memory_working_set" + // Total active file size + serMemoryTotalActiveFile string = "memory_total_active_file" + // Total inactive file size + serMemoryTotalInactiveFile string = "memory_total_inactive_file" // Number of memory usage hits limits serMemoryFailcnt string = "memory_failcnt" // Cumulative count of memory allocation failures @@ -256,6 +260,10 @@ func (s *influxdbStorage) memoryStatsToPoints( points = append(points, makePoint(serMemoryMappedFile, stats.Memory.MappedFile)) // Working Set Size points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet)) + // Total Active File Size + points = append(points, makePoint(serMemoryTotalActiveFile, stats.Memory.TotalActiveFile)) + // Total Inactive File Size + points = append(points, makePoint(serMemoryTotalInactiveFile, stats.Memory.TotalInactiveFile)) // Number of memory usage hits limits points = append(points, makePoint(serMemoryFailcnt, stats.Memory.Failcnt)) diff --git a/cmd/internal/storage/influxdb/influxdb_test.go b/cmd/internal/storage/influxdb/influxdb_test.go index 69d06817b5..455a23564a 100644 --- a/cmd/internal/storage/influxdb/influxdb_test.go +++ b/cmd/internal/storage/influxdb/influxdb_test.go @@ -75,6 +75,14 @@ func (self *influxDbTestStorageDriver) StatsEq(a, b *info.ContainerStats) bool { return false } + if a.Memory.TotalActiveFile != b.Memory.TotalActiveFile { + return false + } + + if a.Memory.TotalInactiveFile != b.Memory.TotalInactiveFile { + return false + } + if !reflect.DeepEqual(a.Network, b.Network) { return false } @@ -253,6 +261,8 @@ func TestContainerStatsToPoints(t *testing.T) { assertContainsPointWithValue(t, points, serMemoryMappedFile, stats.Memory.MappedFile) assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage) assertContainsPointWithValue(t, points, serMemoryWorkingSet, stats.Memory.WorkingSet) + assertContainsPointWithValue(t, points, serMemoryTotalActiveFile, stats.Memory.TotalActiveFile) + assertContainsPointWithValue(t, points, serMemoryTotalInactiveFile, stats.Memory.TotalInactiveFile) assertContainsPointWithValue(t, points, serMemoryFailcnt, stats.Memory.Failcnt) assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgfault) assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgmajfault) @@ -346,16 +356,18 @@ func createTestStats() (*info.ContainerInfo, *info.ContainerStats) { LoadAverage: int32(rand.Intn(1000)), }, Memory: info.MemoryStats{ - Usage: 26767396864, - MaxUsage: 30429605888, - Cache: 7837376512, - RSS: 18930020352, - Swap: 1024, - MappedFile: 1025327104, - WorkingSet: 23630012416, - Failcnt: 1, - ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97}, - HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96}, + Usage: 26767396864, + MaxUsage: 30429605888, + Cache: 7837376512, + RSS: 18930020352, + Swap: 1024, + MappedFile: 1025327104, + WorkingSet: 23630012416, + TotalActiveFile: 29459246253, + TotalInactiveFile: 28364536434, + Failcnt: 1, + ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97}, + HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96}, }, Hugetlb: map[string]info.HugetlbStats{ "1GB": {Usage: 1234, MaxUsage: 5678, Failcnt: 9}, diff --git a/cmd/internal/storage/statsd/statsd.go b/cmd/internal/storage/statsd/statsd.go index 86ee1a335a..14f2f03fbc 100644 --- a/cmd/internal/storage/statsd/statsd.go +++ b/cmd/internal/storage/statsd/statsd.go @@ -57,6 +57,10 @@ const ( serMemoryMappedFile string = "memory_mapped_file" // Working set size serMemoryWorkingSet string = "memory_working_set" + // Total active file size + serMemoryTotalActiveFile string = "memory_total_active_file" + // Total inactive file size + serMemoryTotalInactiveFile string = "memory_total_inactive_file" // Number of memory usage hits limits serMemoryFailcnt string = "memory_failcnt" // Cumulative count of memory allocation failures @@ -159,6 +163,10 @@ func (s *statsdStorage) memoryStatsToValues(series *map[string]uint64, stats *in (*series)[serMemoryMappedFile] = stats.Memory.MappedFile // Working Set Size (*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet + // Total Active File Size + (*series)[serMemoryTotalActiveFile] = stats.Memory.TotalActiveFile + // Total Inactive File Size + (*series)[serMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile // Number of memory usage hits limits (*series)[serMemoryFailcnt] = stats.Memory.Failcnt diff --git a/cmd/internal/storage/stdout/stdout.go b/cmd/internal/storage/stdout/stdout.go index 1c4d31d722..15aa61d793 100644 --- a/cmd/internal/storage/stdout/stdout.go +++ b/cmd/internal/storage/stdout/stdout.go @@ -59,6 +59,10 @@ const ( serMemoryMappedFile string = "memory_mapped_file" // Working set size serMemoryWorkingSet string = "memory_working_set" + // Total active file + serMemoryTotalActiveFile string = "memory_total_active_file" + // Total inactive file + serMemoryTotalInactiveFile string = "memory_total_inactive_file" // Number of memory usage hits limits serMemoryFailcnt string = "memory_failcnt" // Cumulative count of memory allocation failures @@ -164,6 +168,10 @@ func (driver *stdoutStorage) memoryStatsToValues(series *map[string]uint64, stat (*series)[serMemoryMappedFile] = stats.Memory.MappedFile // Working Set Size (*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet + // Total Active File + (*series)[serMemoryTotalActiveFile] = stats.Memory.TotalActiveFile + // Total Inactive File + (*series)[serMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile // Number of memory usage hits limits (*series)[serMemoryFailcnt] = stats.Memory.Failcnt diff --git a/container/libcontainer/handler.go b/container/libcontainer/handler.go index 5b1f84f57f..d8b46b975b 100644 --- a/container/libcontainer/handler.go +++ b/container/libcontainer/handler.go @@ -834,8 +834,18 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { inactiveFileKeyName = "inactive_file" } + activeFileKeyName := "total_active_file" + if cgroups.IsCgroup2UnifiedMode() { + activeFileKeyName = "active_file" + } + + if v, ok := s.MemoryStats.Stats[activeFileKeyName]; ok { + ret.Memory.TotalActiveFile = v + } + workingSet := ret.Memory.Usage if v, ok := s.MemoryStats.Stats[inactiveFileKeyName]; ok { + ret.Memory.TotalInactiveFile = v if workingSet < v { workingSet = 0 } else { diff --git a/info/v1/container.go b/info/v1/container.go index efcfd5628e..b96a985da8 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -393,6 +393,14 @@ type MemoryStats struct { // Units: Bytes. WorkingSet uint64 `json:"working_set"` + // The total amount of active file memory. + // Units: Bytes. + TotalActiveFile uint64 `json:"total_active_file"` + + // The total amount of inactive file memory. + // Units: Bytes. + TotalInactiveFile uint64 `json:"total_inactive_file"` + Failcnt uint64 `json:"failcnt"` // Size of kernel memory allocated in bytes. diff --git a/info/v2/conversion_test.go b/info/v2/conversion_test.go index c0983316c2..ca470e9480 100644 --- a/info/v2/conversion_test.go +++ b/info/v2/conversion_test.go @@ -137,11 +137,13 @@ func TestContainerStatsFromV1(t *testing.T) { v1Stats := v1.ContainerStats{ Timestamp: timestamp, Memory: v1.MemoryStats{ - Usage: 1, - Cache: 2, - RSS: 3, - WorkingSet: 4, - Failcnt: 5, + Usage: 1, + Cache: 2, + RSS: 3, + WorkingSet: 4, + Failcnt: 5, + TotalActiveFile: 6, + TotalInactiveFile: 7, ContainerData: v1.MemoryStatsMemoryData{ Pgfault: 1, Pgmajfault: 2, diff --git a/integration/tests/api/test_utils.go b/integration/tests/api/test_utils.go index 82899f89d8..cb884b8cbc 100644 --- a/integration/tests/api/test_utils.go +++ b/integration/tests/api/test_utils.go @@ -69,6 +69,8 @@ func checkMemoryStats(t *testing.T, stat info.MemoryStats) { assert.NotEqual(0, stat.Usage, "Memory usage should not be zero") assert.NotEqual(0, stat.WorkingSet, "Memory working set should not be zero") + assert.NotEqual(0, stat.TotalActiveFile, "Memory total active file should not be zero") + assert.NotEqual(0, stat.TotalInactiveFile, "Memory total inactive file should not be zero") if stat.WorkingSet > stat.Usage { t.Errorf("Memory working set (%d) should be at most equal to memory usage (%d)", stat.WorkingSet, stat.Usage) } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index a6fc3dff4c..5dac3e116c 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -431,6 +431,22 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri return metricValues{{value: float64(s.Memory.WorkingSet), timestamp: s.Timestamp}} }, }, + { + name: "container_memory_total_active_file_bytes", + help: "Current total active file in bytes.", + valueType: prometheus.GaugeValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Memory.TotalActiveFile), timestamp: s.Timestamp}} + }, + }, + { + name: "container_memory_total_inactive_file_bytes", + help: "Current total inactive file in bytes.", + valueType: prometheus.GaugeValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Memory.TotalInactiveFile), timestamp: s.Timestamp}} + }, + }, { name: "container_memory_failures_total", help: "Cumulative count of memory allocation failures.", diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index 7b7399778d..8ee7685d76 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -329,9 +329,11 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req LoadAverage: 2, }, Memory: info.MemoryStats{ - Usage: 8, - MaxUsage: 8, - WorkingSet: 9, + Usage: 8, + MaxUsage: 8, + WorkingSet: 9, + TotalActiveFile: 7, + TotalInactiveFile: 6, ContainerData: info.MemoryStatsMemoryData{ Pgfault: 10, Pgmajfault: 11, diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index 924e58ede2..aa1d01c1d0 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -180,6 +180,12 @@ container_memory_rss{container_env_foo_env="prod",container_label_foo_label="bar # HELP container_memory_swap Container swap usage in bytes. # TYPE container_memory_swap gauge container_memory_swap{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8192 1395066363000 +# HELP container_memory_total_active_file_bytes Current total active file in bytes. +# TYPE container_memory_total_active_file_bytes gauge +container_memory_total_active_file_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7 1395066363000 +# HELP container_memory_total_inactive_file_bytes Current total inactive file in bytes. +# TYPE container_memory_total_inactive_file_bytes gauge +container_memory_total_inactive_file_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6 1395066363000 # HELP container_memory_usage_bytes Current memory usage in bytes, including all memory regardless of when it was accessed # TYPE container_memory_usage_bytes gauge container_memory_usage_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index 1724b3c8ba..e8a1986a77 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -180,6 +180,12 @@ container_memory_rss{container_env_foo_env="prod",id="testcontainer",image="test # HELP container_memory_swap Container swap usage in bytes. # TYPE container_memory_swap gauge container_memory_swap{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8192 1395066363000 +# HELP container_memory_total_active_file_bytes Current total active file in bytes. +# TYPE container_memory_total_active_file_bytes gauge +container_memory_total_active_file_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7 1395066363000 +# HELP container_memory_total_inactive_file_bytes Current total inactive file in bytes. +# TYPE container_memory_total_inactive_file_bytes gauge +container_memory_total_inactive_file_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6 1395066363000 # HELP container_memory_usage_bytes Current memory usage in bytes, including all memory regardless of when it was accessed # TYPE container_memory_usage_bytes gauge container_memory_usage_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000