diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry
index 3680bbe6cf9a..ab3000c899d0 100644
--- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry
+++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry
@@ -9,3 +9,6 @@ check program telemetry|telemetry with path "/usr/bin/process_checker telemetry
 
 check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
     if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
+
+check program container_memory_telemetry with path "/usr/bin/memory_checker telemetry 419430400"
+    if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service telemetry"
diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2
index 274019215bec..2bf752fc31d7 100644
--- a/files/build_templates/sonic_debian_extension.j2
+++ b/files/build_templates/sonic_debian_extension.j2
@@ -334,6 +334,10 @@ sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
 sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
 sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/
 sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker
+sudo cp $IMAGE_CONFIGS/monit/memory_checker $FILESYSTEM_ROOT/usr/bin/
+sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/memory_checker
+sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
+sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service
 
 
 # Install custom-built openssh sshd
diff --git a/files/image_config/monit/memory_checker b/files/image_config/monit/memory_checker
new file mode 100755
index 000000000000..4f7912c1e56c
--- /dev/null
+++ b/files/image_config/monit/memory_checker
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+"""
+memory_checker
+
+This script is part of the feature which will restart the container if memory
+usage of it is larger than the threshold value.
+
+This script is used to check the memory usage of specified container and
+is intended to be run by Monit. It will write an alerting message into
+syslog if memory usage of the container is larger than the threshold value for X
+times within Y cycles/minutes. Note that if print(...) statement in this script
+was executed, the string in it will be appended to Monit syslog messages.
+
+Exit codes:
+    0: memory usage is not larger than the threshold value.
+    1: the 'docker stats' command returned a non-zero exit code.
+    2: the 'docker stats' command could not be executed.
+    3: memory usage is larger than the threshold value.
+    4: the output of 'docker stats' could not be parsed.
+
+The following is an example in Monit configuration file to show how Monit will run
+this script:
+
+check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
+    if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
+"""
+
+import argparse
+import subprocess
+import sys
+import syslog
+import re
+
+
+# Factors which convert the unit suffixes printed by 'docker stats' into bytes.
+UNIT_TO_BYTES = {
+    "B": 1,
+    "KiB": 1024,
+    "MiB": 1024 ** 2,
+    "GiB": 1024 ** 3,
+    "TiB": 1024 ** 4,
+}
+
+
+def get_command_result(command):
+    """Executes the command and return the resulting output.
+
+    The script exits with status 1 if the command returns a non-zero exit code
+    and with status 2 if the command could not be started.
+
+    Args:
+        command: The command to be executed. A list of strings is executed
+            directly without a shell; a string is handed to the shell.
+
+    Returns:
+        A string which contains the stripped standard output of the command.
+    """
+    command_stdout = ""
+    # Readable form of the command which is only used in the log messages.
+    command_text = command if isinstance(command, str) else " ".join(command)
+
+    try:
+        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                         shell=isinstance(command, str), universal_newlines=True)
+        command_stdout, command_stderr = proc_instance.communicate()
+        if proc_instance.returncode != 0:
+            syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Return code: '{}'. Error: '{}'"
+                          .format(command_text, proc_instance.returncode, command_stderr.strip()))
+            sys.exit(1)
+    except (OSError, ValueError) as err:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
+                      .format(command_text, err))
+        sys.exit(2)
+
+    return command_stdout.strip()
+
+
+def check_memory_usage(container_name, threshold_value):
+    """Checks the memory usage of a container and writes an alerting message into
+    the syslog if the memory usage is larger than the threshold value.
+
+    The script exits with status 3 if the memory usage is larger than the
+    threshold value and with status 4 if the memory usage could not be parsed.
+
+    Args:
+        container_name: A string represents name of a container
+        threshold_value: An integer indicates the threshold value (Bytes) of memory usage.
+
+    Returns:
+        None.
+    """
+    # The arguments are passed as a list and executed without a shell, so the braces
+    # of the Go template need no escaping and the container name is never
+    # interpreted by a shell.
+    command = ["docker", "stats", "--no-stream", "--format", "{{.MemUsage}}", container_name]
+    command_stdout = get_command_result(command)
+    # The output has the form '<usage> / <limit>'; only the usage part is needed.
+    mem_usage = command_stdout.split("/")[0].strip()
+    match_obj = re.match(r"\d+\.?\d*", mem_usage)
+    if not match_obj:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
+                      .format(mem_usage))
+        sys.exit(4)
+
+    mem_usage_value = float(match_obj.group())
+    mem_usage_unit = mem_usage[match_obj.end():].strip()
+    if mem_usage_unit not in UNIT_TO_BYTES:
+        # An unknown unit must not be treated as 0 Bytes since the check would then
+        # silently never fire.
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Unknown unit '{}' in memory usage '{}'"
+                      .format(mem_usage_unit, mem_usage))
+        sys.exit(4)
+
+    mem_usage_bytes = mem_usage_value * UNIT_TO_BYTES[mem_usage_unit]
+    if mem_usage_bytes > threshold_value:
+        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
+              .format(container_name, mem_usage_bytes, threshold_value))
+        sys.exit(3)
+
+
+def main():
+    """Parses the command line arguments and checks the memory usage of the container."""
+    parser = argparse.ArgumentParser(description="Check memory usage of a container \
+        and an alerting message will be written into syslog if memory usage \
+        is larger than the threshold value", usage="/usr/bin/memory_checker <container_name> <threshold_value>")
+    parser.add_argument("container_name", help="container name")
+    # TODO: Currently the threshold value is hard coded as a command line argument and will
+    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
+    parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
+    args = parser.parse_args()
+
+    check_memory_usage(args.container_name, args.threshold_value)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/files/image_config/monit/restart_service b/files/image_config/monit/restart_service
new file mode 100755
index 000000000000..40da147e9526
--- /dev/null
+++ b/files/image_config/monit/restart_service
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+"""
+restart_service
+
+This script is part of the feature which will restart the container if memory
+usage of it is larger than the threshold value.
+
+This script is intended to be run by Monit and is used to restart the specified
+container if the memory usage of it is larger than the threshold value for X
+times within Y cycles/minutes.
+
+The following is an example in Monit configuration file to show how Monit will run
+this script:
+
+check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
+    if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
+"""
+
+import argparse
+import sys
+import syslog
+import subprocess
+
+
+def get_command_result(command):
+    """Executes command and return the exit code, stdout and stderr.
+
+    Args:
+        command: The command to be executed. A list of strings is executed
+            directly without a shell; a string is handed to the shell.
+
+    Returns:
+        An integer contains the exit code: 0 on success, 1 if the command returned
+            a non-zero exit code and 2 if the command could not be started.
+        A string contains the output of stdout.
+        A string contains the output of stderr or the error message.
+    """
+    command_stdout = ""
+    command_stderr = ""
+
+    try:
+        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                         shell=isinstance(command, str), universal_newlines=True)
+        command_stdout, command_stderr = proc_instance.communicate()
+        if proc_instance.returncode != 0:
+            return 1, command_stdout.strip(), command_stderr.strip()
+    except (OSError, ValueError) as err:
+        # Convert the exception so that callers always receive a string.
+        return 2, command_stdout.strip(), str(err)
+
+    return 0, command_stdout.strip(), command_stderr.strip()
+
+
+def reset_failed_flag(service_name):
+    """Reset the failed status of a service.
+
+    Args:
+        service_name: Name of the service.
+
+    Returns:
+        None
+    """
+    reset_failed_command = ["sudo", "systemctl", "reset-failed", "{}.service".format(service_name)]
+
+    syslog.syslog(syslog.LOG_INFO, "Resetting failed status of service '{}' ..."
+                  .format(service_name))
+
+    exit_code, _, command_stderr = get_command_result(reset_failed_command)
+    if exit_code == 0:
+        syslog.syslog(syslog.LOG_INFO, "Succeeded to reset failed status of service '{}.service'."
+                      .format(service_name))
+    else:
+        syslog.syslog(syslog.LOG_ERR, "Failed to reset failed status of service '{}'. Error: {}"
+                      .format(service_name, command_stderr))
+
+
+def restart_service(service_name):
+    """Reset the failed status of a service and then restart it.
+
+    The script exits with a non-zero status if the service could not be restarted.
+
+    Args:
+        service_name: Name of specified service.
+
+    Returns:
+        None.
+    """
+    restart_command = ["sudo", "systemctl", "restart", "{}.service".format(service_name)]
+
+    reset_failed_flag(service_name)
+
+    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
+    exit_code, _, command_stderr = get_command_result(restart_command)
+    if exit_code != 0:
+        syslog.syslog(syslog.LOG_ERR, "Failed to restart the service '{}'. Error: {}"
+                      .format(service_name, command_stderr))
+        # Report the failure to the caller instead of exiting with status 0.
+        sys.exit(exit_code)
+
+
+def main():
+    """Parses the command line arguments and restarts the specified service."""
+    parser = argparse.ArgumentParser(description="Restart a specific service",
+                                     usage="/usr/bin/restart_service <service_name>")
+    parser.add_argument("service_name", help="service name")
+    args = parser.parse_args()
+
+    restart_service(args.service_name)
+
+
+if __name__ == "__main__":
+    main()