From e40b09667ccde833c433d0a583013a65a95ecfdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christina=20S=C3=B8rensen?= Date: Mon, 6 Jan 2025 09:05:11 +0100 Subject: [PATCH] feat: group memory.stats sock metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the `sock` counter from the cgroup `memory.stat` file to cAdvisor as a new socket memory metric. The motivation is that we've seen numerous examples at DBC Digital of application developers creating applications that exhaust socket memory, e.g. by accidentally opening too many TCP connections without closing them, by keeping a few large allocations around, or through many other such issues. Because cAdvisor currently doesn't report socket memory usage, this has been hard to monitor and is only noticed once the OOM killer is triggered. With this metric it becomes possible to handle socket memory exhaustion (which is really kernel memory exhaustion) proactively, before it becomes an incident, and to build alerting and better observability for this failure mode. 
Signed-off-by: Christina Sørensen --- cmd/internal/storage/influxdb/influxdb.go | 4 ++++ cmd/internal/storage/influxdb/influxdb_test.go | 2 ++ cmd/internal/storage/statsd/statsd.go | 4 ++++ cmd/internal/storage/stdout/stdout.go | 4 ++++ container/libcontainer/handler.go | 2 ++ info/v1/container.go | 4 ++++ info/v1/test/datagen.go | 1 + metrics/prometheus.go | 7 +++++++ metrics/prometheus_fake.go | 1 + 9 files changed, 29 insertions(+) diff --git a/cmd/internal/storage/influxdb/influxdb.go b/cmd/internal/storage/influxdb/influxdb.go index a651e54a85..695b6b1dca 100644 --- a/cmd/internal/storage/influxdb/influxdb.go +++ b/cmd/internal/storage/influxdb/influxdb.go @@ -68,6 +68,8 @@ const ( serMemorySwap string = "memory_swap" // Size of memory mapped files in bytes serMemoryMappedFile string = "memory_mapped_file" + // Size of socket memory in bytes + serMemorySocket string = "memory_socket" // Working set size serMemoryWorkingSet string = "memory_working_set" // Total active file size @@ -258,6 +260,8 @@ func (s *influxdbStorage) memoryStatsToPoints( points = append(points, makePoint(serMemorySwap, stats.Memory.Swap)) // Size of memory mapped files in bytes points = append(points, makePoint(serMemoryMappedFile, stats.Memory.MappedFile)) + // Size of socket memory in bytes + points = append(points, makePoint(serMemorySocket, stats.Memory.Socket)) // Working Set Size points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet)) // Total Active File Size diff --git a/cmd/internal/storage/influxdb/influxdb_test.go b/cmd/internal/storage/influxdb/influxdb_test.go index 455a23564a..e889beebd7 100644 --- a/cmd/internal/storage/influxdb/influxdb_test.go +++ b/cmd/internal/storage/influxdb/influxdb_test.go @@ -259,6 +259,7 @@ func TestContainerStatsToPoints(t *testing.T) { assertContainsPointWithValue(t, points, serMemoryRss, stats.Memory.RSS) assertContainsPointWithValue(t, points, serMemorySwap, stats.Memory.Swap) assertContainsPointWithValue(t, points, 
serMemoryMappedFile, stats.Memory.MappedFile) + assertContainsPointWithValue(t, points, serMemorySocket, stats.Memory.Socket) assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage) assertContainsPointWithValue(t, points, serMemoryWorkingSet, stats.Memory.WorkingSet) assertContainsPointWithValue(t, points, serMemoryTotalActiveFile, stats.Memory.TotalActiveFile) @@ -362,6 +363,7 @@ func createTestStats() (*info.ContainerInfo, *info.ContainerStats) { RSS: 18930020352, Swap: 1024, MappedFile: 1025327104, + Socket: 1025327104, WorkingSet: 23630012416, TotalActiveFile: 29459246253, TotalInactiveFile: 28364536434, diff --git a/cmd/internal/storage/statsd/statsd.go b/cmd/internal/storage/statsd/statsd.go index 14f2f03fbc..2057258cc9 100644 --- a/cmd/internal/storage/statsd/statsd.go +++ b/cmd/internal/storage/statsd/statsd.go @@ -55,6 +55,8 @@ const ( serMemorySwap string = "memory_swap" // Size of memory mapped files in bytes serMemoryMappedFile string = "memory_mapped_file" + // Size of socket memory in bytes + serMemorySocket string = "memory_socket" // Working set size serMemoryWorkingSet string = "memory_working_set" // Total active file size @@ -161,6 +163,8 @@ func (s *statsdStorage) memoryStatsToValues(series *map[string]uint64, stats *in (*series)[serMemorySwap] = stats.Memory.Swap // Size of memory mapped files in bytes (*series)[serMemoryMappedFile] = stats.Memory.MappedFile + // Size of socket memory in bytes. 
+ (*series)[serMemorySocket] = stats.Memory.Socket // Working Set Size (*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet // Total Active File Size diff --git a/cmd/internal/storage/stdout/stdout.go b/cmd/internal/storage/stdout/stdout.go index 15aa61d793..56bfd904b5 100644 --- a/cmd/internal/storage/stdout/stdout.go +++ b/cmd/internal/storage/stdout/stdout.go @@ -57,6 +57,8 @@ const ( serMemorySwap string = "memory_swap" // Size of memory mapped files in bytes serMemoryMappedFile string = "memory_mapped_file" + // Size of socket memory in bytes + serMemorySocket string = "memory_socket" // Working set size serMemoryWorkingSet string = "memory_working_set" // Total active file @@ -166,6 +168,8 @@ func (driver *stdoutStorage) memoryStatsToValues(series *map[string]uint64, stat (*series)[serMemorySwap] = stats.Memory.Swap // Size of memory mapped files in bytes (*series)[serMemoryMappedFile] = stats.Memory.MappedFile + // Size of socket memory in bytes + (*series)[serMemorySocket] = stats.Memory.Socket // Working Set Size (*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet // Total Active File diff --git a/container/libcontainer/handler.go b/container/libcontainer/handler.go index 9f05a52a49..7e21456ee3 100644 --- a/container/libcontainer/handler.go +++ b/container/libcontainer/handler.go @@ -799,6 +799,7 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage + ret.Memory.Socket = s.MemoryStats.Stats["sock"] if cgroups.IsCgroup2UnifiedMode() { ret.Memory.Cache = s.MemoryStats.Stats["file"] diff --git a/info/v1/container.go b/info/v1/container.go index ae1d9caecc..133ae3bded 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -390,6 +390,10 @@ type MemoryStats struct { // The amount of memory used for mapped files (includes tmpfs/shmem) 
MappedFile uint64 `json:"mapped_file"` + // The amount of kernel memory used by the sockets spawned by the process. + // Units: Bytes. + Socket uint64 `json:"socket"` + // The amount of working set memory, this includes recently accessed memory, // dirty memory, and kernel memory. Working set is <= "usage". // Units: Bytes. diff --git a/info/v1/test/datagen.go b/info/v1/test/datagen.go index e1478cbaff..d23d0c129a 100644 --- a/info/v1/test/datagen.go +++ b/info/v1/test/datagen.go @@ -47,6 +47,7 @@ func GenerateRandomStats(numStats, numCores int, duration time.Duration) []*info stats.Memory.Cache = uint64(rand.Int63n(4096)) stats.Memory.RSS = uint64(rand.Int63n(4096)) stats.Memory.MappedFile = uint64(rand.Int63n(4096)) + stats.Memory.Socket = uint64(rand.Int63n(4096)) stats.Memory.KernelUsage = uint64(rand.Int63n(4096)) stats.ReferencedMemory = uint64(rand.Int63n(1000)) ret[i] = stats diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 86064819d3..1975d9fbbc 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -398,6 +398,13 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri getValues: func(s *info.ContainerStats) metricValues { return metricValues{{value: float64(s.Memory.MappedFile), timestamp: s.Timestamp}} }, + }, { + name: "container_memory_socket", + help: "Size of kernel memory allocated by sockets.", + valueType: prometheus.GaugeValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Memory.Socket), timestamp: s.Timestamp}} + }, }, { name: "container_memory_swap", help: "Container swap usage in bytes.", diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index fd43b78148..2b9c268311 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -356,6 +356,7 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req Cache: 14, RSS: 15, MappedFile: 16, + Socket: 16, KernelUsage: 17, Swap: 8192, },