Skip to content

Commit

Permalink
[Monit] Restart telemetry container if memory usage is beyond the thr…
Browse files Browse the repository at this point in the history
…eshold (sonic-net#7645)

Signed-off-by: Yong Zhao yozhao@microsoft.com

Why I did it
This PR monitors the memory usage of the streaming telemetry container and restarts the container if its memory usage exceeds a pre-defined threshold.

How I did it
I used the system tool Monit to run a script, memory_checker, which periodically checks the memory usage of the streaming telemetry container. If the memory usage of the telemetry container is larger than the pre-defined threshold 10 times within 20 cycles, an alerting message is written to syslog and Monit runs the script restart_service to restart the streaming telemetry container.

How to verify it
I verified this implementation on device str-7260cx3-acs-1.
  • Loading branch information
yozhao101 authored and Carl Keene committed Aug 7, 2021
1 parent c98a934 commit e1e8410
Show file tree
Hide file tree
Showing 4 changed files with 221 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ check program telemetry|telemetry with path "/usr/bin/process_checker telemetry

check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

# memory_checker exits with status 3 when the container's memory usage is above
# the threshold given in bytes (419430400 bytes = 400 MiB); restart the
# telemetry service when that happens 10 times within 20 cycles.
check program container_memory_telemetry with path "/usr/bin/memory_checker telemetry 419430400"
if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service telemetry"
4 changes: 4 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,10 @@ sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
# Copy the Monit helper scripts into the image's /usr/bin and make them executable.
sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker
sudo cp $IMAGE_CONFIGS/monit/memory_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/memory_checker
sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service


# Install custom-built openssh sshd
Expand Down
109 changes: 109 additions & 0 deletions files/image_config/monit/memory_checker
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python3

"""
memory_checker
This script is part of the feature which will restart the container if memory
usage of it is larger than the threshold value.
This script checks the memory usage of the specified container and
is intended to be run by Monit. It reports an alert if memory usage of the
container is larger than the threshold value for X times within Y
cycles/minutes. Note that anything this script writes with print(...) is
appended by Monit to its own syslog messages.
The following is an example in Monit configuration file to show how Monit will run
this script:
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
"""

import argparse
import subprocess
import sys
import syslog
import re


def get_command_result(command):
    """Run a shell command and hand back its trimmed standard output.

    The process exits instead of returning when the command cannot be run
    or reports a failure, so callers only ever see successful output.

    Args:
        command: A string holding the command line; it is run through the shell.

    Returns:
        The command's standard output with surrounding whitespace removed.

    Raises:
        SystemExit: with code 1 if the command returns a non-zero status, or
            with code 2 if launching it raises OSError or ValueError.
    """
    try:
        completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                   shell=True, universal_newlines=True)
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    if completed.returncode != 0:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Return code: '{}'"
                      .format(command, completed.returncode))
        sys.exit(1)

    return completed.stdout.strip()


def check_memory_usage(container_name, threshold_value):
    """Check a container's memory usage against a threshold.

    Queries `docker stats` for the container, converts the reported usage
    (the part before the "/") to bytes and compares it with the threshold.
    Monit acts on the exit status of this script, so the outcome is reported
    through `sys.exit` rather than a return value.

    Args:
        container_name: A string representing the name of a container.
        threshold_value: An integer threshold (in bytes) for memory usage.

    Returns:
        None, when the usage is at or below the threshold.

    Raises:
        SystemExit: with code 3 if the usage is larger than the threshold
            (a message is printed first so Monit can append it to syslog),
            or with code 4 if the value or its unit cannot be parsed.
    """
    # Multipliers for the binary unit suffixes handled by this script.
    unit_factors = {
        "B": 1,
        "KiB": 1024,
        "MiB": 1024 ** 2,
        "GiB": 1024 ** 3,
        "TiB": 1024 ** 4,
    }

    # The doubled braces are str.format escapes and yield the Go template
    # {{.MemUsage}}; single quotes keep the shell from touching it.
    command = "docker stats --no-stream --format '{{{{.MemUsage}}}}' {}".format(container_name)
    command_stdout = get_command_result(command)
    mem_usage = command_stdout.split("/")[0].strip()

    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    if not match_obj:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    mem_usage_value = float(match_obj.group())
    mem_usage_unit = mem_usage[match_obj.end():].strip()

    # An unknown unit must not be treated as 0 bytes: that would silently
    # disable the check, so report it as a parse failure instead.
    if mem_usage_unit not in unit_factors:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Unrecognized memory unit '{}' in '{}'"
                      .format(mem_usage_unit, mem_usage))
        sys.exit(4)

    mem_usage_bytes = mem_usage_value * unit_factors[mem_usage_unit]
    if mem_usage_bytes > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        sys.exit(3)


def main():
    """Read the container name and threshold from the command line and run the check."""
    description = ("Check memory usage of a container "
                   "and an alerting message will be written into syslog if memory usage "
                   "is larger than the threshold value")
    arg_parser = argparse.ArgumentParser(
        description=description,
        usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")
    arg_parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    arg_parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
    options = arg_parser.parse_args()

    check_memory_usage(options.container_name, options.threshold_value)


if __name__ == "__main__":
    main()
105 changes: 105 additions & 0 deletions files/image_config/monit/restart_service
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3

"""
restart_service
This script is part of the feature which will restart the container if memory
usage of it is larger than the threshold value.
This script is intended to be run by Monit and is used to restart the specified
container if the memory usage of it is larger than the threshold value for X
times within Y cycles/minutes.
The following is an example in Monit configuration file to show how Monit will run
this script:
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
"""

import argparse
import sys
import syslog
import subprocess


def get_command_result(command):
    """Execute a shell command and return its status, stdout and stderr.

    Args:
        command: A string holding the command line; it is run through the shell.

    Returns:
        A tuple (status, stdout, stderr) where:
            status: 0 if the command succeeded, 1 if it returned any non-zero
                exit code, 2 if it could not be launched (OSError/ValueError).
            stdout: A string with the command's trimmed standard output.
            stderr: A string with the command's trimmed standard error, or
                the text of the exception when status is 2.
    """
    command_stdout = ""
    command_stderr = ""

    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        # Return the error as text so the third element is always a string,
        # matching the documented contract and the non-exception paths.
        return 2, command_stdout.strip(), str(err)

    status = 0 if proc_instance.returncode == 0 else 1
    return status, command_stdout.strip(), command_stderr.strip()


def reset_failed_flag(service_name):
    """Clear the systemd 'failed' state of a service and log the outcome.

    Args:
        service_name: Name of the service, without the '.service' suffix.

    Returns:
        None
    """
    syslog.syslog(syslog.LOG_INFO, "Resetting failed status of service '{}' ..."
                  .format(service_name))

    status, _, error_text = get_command_result(
        "sudo systemctl reset-failed {}.service".format(service_name))

    if status != 0:
        syslog.syslog(syslog.LOG_ERR, "Failed to reset failed status of service '{}'. Error: {}"
                      .format(service_name, error_text))
        return

    syslog.syslog(syslog.LOG_INFO, "Succeeded to reset failed status of service '{}.service'."
                  .format(service_name))


def restart_service(service_name):
    """Clear a service's failed state, then restart it through systemctl.

    A failed restart is logged to syslog; nothing is raised or returned.

    Args:
        service_name: Name of the service, without the '.service' suffix.

    Returns:
        None.
    """
    reset_failed_flag(service_name)

    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
    status, _, error_text = get_command_result(
        "sudo systemctl restart {}.service".format(service_name))
    if status == 0:
        return

    syslog.syslog(syslog.LOG_ERR, "Failed to restart the service '{}'. Error: {}"
                  .format(service_name, error_text))


def main():
    """Read the service name from the command line and restart that service."""
    arg_parser = argparse.ArgumentParser(
        description="Restart a specific service",
        usage="/usr/bin/restart_service <service_name>",
    )
    arg_parser.add_argument("service_name", help="service name")
    options = arg_parser.parse_args()

    restart_service(options.service_name)


if __name__ == "__main__":
    main()

0 comments on commit e1e8410

Please sign in to comment.