From 92fceb14c1f6ba290ea73163bff0328aa1ef9958 Mon Sep 17 00:00:00 2001
From: Seth Jennings
Date: Thu, 27 Aug 2020 12:44:21 -0500
Subject: [PATCH] add alert for system.slice rss going over system-reserved

---
 ...0_90_machine-config-operator_01_prometheus-rules.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
index a0de148657..12e29df3dc 100644
--- a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
+++ b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
@@ -43,3 +43,17 @@ spec:
             severity: warning
           annotations:
             message: "Kubelet health failure threshold reached"
+    - name: system-memory-exceeds-reservation
+      rules:
+        # Fires when the RSS of /system.slice on a node is above 90% of the
+        # node's memory reservation (capacity minus allocatable).
+        - alert: SystemMemoryExceedsReservation
+          expr: |
+            sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.9)
+          # Only fire once the condition has held for 15 minutes, so a brief
+          # RSS spike does not raise a warning.
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            message: "System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 90% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the node. The reservation may be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods."