From a870125b27790830c202c8ab216b30e82c1ed17a Mon Sep 17 00:00:00 2001 From: Daniel Dao Date: Tue, 22 Mar 2022 16:57:52 +0000 Subject: [PATCH] Expose PSI metrics with prometheus This adds support for reading PSI metrics via prometheus. We exposes the following for `psi_total`: ``` container_cpu_psi_total_seconds container_memory_psi_total_seconds container_io_psi_total_seconds ``` And for `psi_avg`: ``` container_cpu_psi_avg10_ratio container_cpu_psi_avg60_ratio container_cpu_psi_avg300_ratio container_memory_psi_avg10_ratio container_memory_psi_avg60_ratio container_memory_psi_avg300_ratio container_io_psi_avg10_ratio container_io_psi_avg60_ratio container_io_psi_avg300_ratio ``` Signed-off-by: Daniel Dao --- metrics/prometheus.go | 77 +++++++++++++++++++ metrics/prometheus_fake.go | 42 ++++++++++ metrics/testdata/prometheus_metrics | 48 ++++++++++++ .../prometheus_metrics_whitelist_filtered | 48 ++++++++++++ 4 files changed, 215 insertions(+) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 04c8d27e8f7..ee3975a177d 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1768,6 +1768,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri }) } + if includedMetrics.Has(container.PSITotalMetrics) { + c.containerMetrics = append(c.containerMetrics, []containerMetric{ + { + name: "container_cpu_psi_total_seconds", + help: "Total time spent under cpu pressure in seconds.", + valueType: prometheus.CounterValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + return getPSIValues(s, &s.Cpu.PSI, "total") + }, + }, { + name: "container_memory_psi_total_seconds", + help: "Total container time spent under memory pressure in seconds.", + valueType: prometheus.CounterValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + return getPSIValues(s, &s.Memory.PSI, "total") + }, + }, { + name: "container_io_psi_total_seconds", + help: "Total time spent under io pressure in seconds.", + valueType: prometheus.CounterValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + return getPSIValues(s, &s.DiskIo.PSI, "total") + }, + }, + }...) + } + + if includedMetrics.Has(container.PSIAvgMetrics) { + makePSIAvgMetric := func(controller, window string) containerMetric { + return containerMetric{ + name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window), + help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window), + valueType: prometheus.GaugeValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + switch controller { + case "cpu": + return getPSIValues(s, &s.Cpu.PSI, "avg"+window) + case "memory": + return getPSIValues(s, &s.Memory.PSI, "avg"+window) + case "io": + return getPSIValues(s, &s.DiskIo.PSI, "avg"+window) + default: + return nil + } + }, + } + } + for _, controller := range []string{"cpu", "memory", "io"} { + for _, window := range []string{"10", "60", "300"} { + c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window)) + } + } + } + return c } @@ -2060,3 +2118,22 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues { } return values } + +func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues { + v := make(metricValues, 0, 2) + switch psiMetric { + case "avg10": + v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}}) + case "avg60": + v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}}) + case "avg300": + v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}}) + case "total": + v = append(v, metricValue{value: float64(psi.Some.Total / 1e9), timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: float64(psi.Full.Total / 1e9), timestamp: s.Timestamp, labels: []string{"full"}}) + } + return v +} diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index 822b3f82c97..55ffc5ced1f 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -319,6 +319,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req RunPeriods: 984285, }, LoadAverage: 2, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.1, + Avg60: 0.2, + Avg300: 0.3, + Total: 100, + }, + Full: info.PSIData{ + Avg10: 0.4, + Avg60: 0.5, + Avg300: 0.6, + Total: 200, + }, + }, }, Memory: info.MemoryStats{ Usage: 8, @@ -346,6 +360,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req RSS: 15, MappedFile: 16, Swap: 8192, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.01, + Avg60: 0.02, + Avg300: 0.03, + Total: 1000, + }, + Full: info.PSIData{ + Avg10: 0.04, + Avg60: 0.05, + Avg300: 0.06, + Total: 2000, + }, + }, }, Hugetlb: map[string]info.HugetlbStats{ "2Mi": { @@ -538,6 +566,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req "Write": 6, }, }}, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.11, + Avg60: 0.12, + Avg300: 0.13, + Total: 1111, + }, + Full: info.PSIData{ + Avg10: 0.14, + Avg60: 0.15, + Avg300: 0.16, + Total: 2222, + }, + }, }, Filesystem: []info.FsStats{ { diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index b6db0825f0b..b9fc890e746 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo # TYPE container_memory_bandwidth_local_bytes gauge container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000 container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000 +# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds +# TYPE container_cpu_psi_avg10_ratio gauge +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000 +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000 +# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds +# TYPE container_cpu_psi_avg300_ratio gauge +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000 +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000 +# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds +# TYPE container_cpu_psi_avg60_ratio gauge +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000 +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000 +# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds. +# TYPE container_cpu_psi_total_seconds counter +container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds +# TYPE container_io_psi_avg10_ratio gauge +container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000 +container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000 +# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds +# TYPE container_io_psi_avg300_ratio gauge +container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000 +container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000 +# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds +# TYPE container_io_psi_avg60_ratio gauge +container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000 +container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000 +# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds. +# TYPE container_io_psi_total_seconds counter +container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds +# TYPE container_memory_psi_avg10_ratio gauge +container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000 +container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000 +# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds +# TYPE container_memory_psi_avg300_ratio gauge +container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000 +container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000 +# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds +# TYPE container_memory_psi_avg60_ratio gauge +container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000 +container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000 +# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds. +# TYPE container_memory_psi_total_seconds counter +container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index a4a01ef9b24..720b50e8e28 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer" # TYPE container_memory_bandwidth_local_bytes gauge container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000 container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000 +# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds +# TYPE container_cpu_psi_avg10_ratio gauge +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000 +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000 +# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds +# TYPE container_cpu_psi_avg300_ratio gauge +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000 +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000 +# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds +# TYPE container_cpu_psi_avg60_ratio gauge +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000 +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000 +# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds. +# TYPE container_cpu_psi_total_seconds counter +container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds +# TYPE container_io_psi_avg10_ratio gauge +container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000 +container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000 +# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds +# TYPE container_io_psi_avg300_ratio gauge +container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000 +container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000 +# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds +# TYPE container_io_psi_avg60_ratio gauge +container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000 +container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000 +# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds. +# TYPE container_io_psi_total_seconds counter +container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds +# TYPE container_memory_psi_avg10_ratio gauge +container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000 +container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000 +# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds +# TYPE container_memory_psi_avg300_ratio gauge +container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000 +container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000 +# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds +# TYPE container_memory_psi_avg60_ratio gauge +container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000 +container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000 +# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds. +# TYPE container_memory_psi_total_seconds counter +container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000 +container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000