Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
monitor process memory consumption and alert for om[is]agent (#2419)
Browse files Browse the repository at this point in the history
  • Loading branch information
xudifsd authored Apr 2, 2019
1 parent e9eb6da commit 1bcb440
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 31 deletions.
29 changes: 25 additions & 4 deletions src/job-exporter/src/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import time
import copy
import os
import collections

from prometheus_client import make_wsgi_app, Counter, Gauge, Histogram
from prometheus_client.core import GaugeMetricFamily
Expand Down Expand Up @@ -84,6 +85,10 @@ def gen_gpu_used_by_zombie_container_counter():
"count of gpu used by zombie container",
labels=["minor_number", "container_id"])

def gen_process_mem_usage_gauge():
    """Build an empty gauge family for per-process memory usage.

    The family is named ``process_mem_usage_byte`` and every sample is
    labelled with the ``pid`` and ``cmd`` of the process it describes.
    """
    metric_name = "process_mem_usage_byte"
    help_text = "memory usage of process, to save space in prometheus, we only expose those who consume more than 500Mb of memory"
    label_names = ["pid", "cmd"]
    return GaugeMetricFamily(metric_name, help_text, labels=label_names)

class ResourceGauges(object):
def __init__(self):
Expand Down Expand Up @@ -768,15 +773,31 @@ def __init__(self, name, sleep_time, atomic_ref, iteration_counter):
Collector.__init__(self, name, sleep_time, atomic_ref, iteration_counter)

def collect_impl(self):
    """Collect zombie-process counts and large-memory process gauges.

    Runs ``ps.get_process_info`` and walks the returned entries:

    * every process whose state is "D" is counted per command name
      (anything containing "nvidia-smi" is folded into one bucket);
    * every process whose rss exceeds 500 MiB gets one sample in the
      memory gauge, labelled with its pid and command name.

    Returns:
        ``[zombie_metrics, process_mem_metrics]`` when process info is
        available, otherwise ``None``.
    """
    process_info = ps.get_process_info(ProcessCollector.cmd_histogram,
            ProcessCollector.cmd_timeout)

    # Covers both an empty list and a None result from the ps helper.
    if not process_info:
        return None

    # only record large memory consumption to save space in prometheus
    mem_threshold = 500 * 1024 * 1024

    zombie_metrics = gen_zombie_process_counter()
    process_mem_metrics = gen_process_mem_usage_gauge()
    zombie_count = collections.defaultdict(lambda: 0)

    for info in process_info:
        # Strip the arguments; fall back to the raw value when the command
        # is empty so that split()[0] cannot raise IndexError.
        tokens = info.cmd.split()
        cmd_name = tokens[0] if tokens else info.cmd

        if info.state == "D":
            if "nvidia-smi" in info.cmd:
                # override command name to make alert rule easier
                zombie_count["nvidia-smi"] += 1
            else:
                zombie_count[cmd_name] += 1

        if info.rss > mem_threshold:
            process_mem_metrics.add_metric([str(info.pid), cmd_name], info.rss)

    for cmd, count in zombie_count.items():
        zombie_metrics.add_metric([cmd], count)

    return [zombie_metrics, process_mem_metrics]
35 changes: 20 additions & 15 deletions src/job-exporter/src/ps.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,47 @@

import subprocess
import logging
import collections

import utils

logger = logging.getLogger(__name__)

class ProcessInfo(object):
    """Plain record describing one process reported by ``ps``.

    Attributes:
        pid: process id, kept as a string.
        state: process state column as reported by ``ps``.
        rss: resident set size as a number in bytes.
        cmd: command line of the process.
    """

    def __init__(self, pid, state, rss, cmd):
        """ pid is string type, rss is a number in byte """
        self.pid = pid
        self.state = state
        self.rss = rss
        self.cmd = cmd

    def __repr__(self):
        # Readable form for log messages and failing test assertions.
        return "ProcessInfo(pid=%r, state=%r, rss=%r, cmd=%r)" % (
            self.pid, self.state, self.rss, self.cmd)

def parse_result(ps):
    """Parse ``ps`` output whose columns are ``state rss pid cmd``.

    Each non-blank line is split on whitespace. The first three fields are
    the state, the rss (multiplied by 1024 to get bytes) and the pid (kept
    as a string); the remaining fields are re-joined with single spaces to
    form the command.

    Lines with fewer than three fields or a non-integer rss are logged and
    skipped, so one bad line does not discard the whole snapshot.

    Args:
        ps: the raw text output of the ``ps`` command.

    Returns:
        A list of ProcessInfo, one per well-formed line, in input order.
    """
    result = []

    for line in ps.split("\n"):
        line = line.strip()
        if len(line) == 0:
            continue

        parts = line.split()
        if len(parts) < 3:
            logger.warning("skip malformed ps line: %r", line)
            continue

        try:
            rss = int(parts[1]) * 1024
        except ValueError:
            logger.warning("skip ps line with non-integer rss: %r", line)
            continue

        state = parts[0]
        pid = parts[2]
        cmd = " ".join(parts[3:])
        result.append(ProcessInfo(pid, state, rss, cmd))

    return result

def get_process_info(histogram, timeout):
    """Run ``ps`` and return the parsed process list.

    Executes ``ps ax --no-headers --format state,rss,pid,cmd`` through
    ``utils.exec_cmd`` and feeds the output to ``parse_result``.

    Args:
        histogram: passed through to ``utils.exec_cmd`` as ``histogram``.
        timeout: passed through to ``utils.exec_cmd`` as ``timeout``.

    Returns:
        The list produced by ``parse_result``, or an empty list when the
        command fails, times out, or any other error occurs (the error is
        logged, never raised).
    """
    try:
        ps_output = utils.exec_cmd(
            ["ps", "ax", "--no-headers", "--format", "state,rss,pid,cmd"],
            histogram=histogram, timeout=timeout)

        return parse_result(ps_output)
    except subprocess.CalledProcessError as e:
        logger.exception("command '%s' return with error (code %d): %s",
                e.cmd, e.returncode, e.output)
    except subprocess.TimeoutExpired:
        logger.warning("ps ax timeout")
    except Exception:
        # Best-effort collector: log and fall through to the empty result.
        logger.exception("exec ps ax error")

    return []
14 changes: 4 additions & 10 deletions src/job-exporter/test/data/ps_sample.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
D /var/drivers/nvidia/current/bin/nvidia-smi -q -x
S /lib/systemd/systemd --system --deserialize 27
S [kthreadd]
I [kworker/0:0H]
I [mm_percpu_wq]
S [ksoftirqd/0]
I [rcu_sched]
I [rcu_bh]
S [migration/0]
S [watchdog/0]
D 2 4 /var/drivers/nvidia/current/bin/nvidia-smi -q -x
S 1 2 /lib/systemd/systemd --system --deserialize 27
S 2 3 [kthreadd]
I 4 5 [kworker/0:0H]
9 changes: 7 additions & 2 deletions src/job-exporter/test/test_ps.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ def test_parse_ps_result(self):
with open(sample_path, "r") as f:
ps_result = f.read()
parse_result = ps.parse_result(ps_result)
target_result = {"nvidia-smi": 1}
self.assertEqual(target_result, parse_result)

self.assertEqual(4, len(parse_result))
self.assertEqual("D", parse_result[0].state)
self.assertEqual("4", parse_result[0].pid)
self.assertEqual(2 * 1024, parse_result[0].rss)
self.assertEqual("/var/drivers/nvidia/current/bin/nvidia-smi -q -x",
parse_result[0].cmd)

if __name__ == '__main__':
unittest.main()
6 changes: 6 additions & 0 deletions src/prometheus/deploy/alerting/node.rules
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,9 @@ groups:
for: 5m
annotations:
summary: "{{$labels.instance}} is not ready"

- alert: AzureAgentConsumeTooMuchMem
expr: process_mem_usage_byte{cmd=~".*om[is]agent.*"} > 1073741824 # 1G
for: 5m
annotations:
summary: "{{$labels.cmd}} with pid {{$labels.pid}} in {{$labels.instance}} consume more than 1G of memory"

0 comments on commit 1bcb440

Please sign in to comment.