diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py index 33d661e3da..03ba2de89e 100644 --- a/scripts/coredump_gen_handler.py +++ b/scripts/coredump_gen_handler.py @@ -110,18 +110,26 @@ def parse_ts_dump_name(self, ts_stdout): syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name") return "" - def invoke_ts_cmd(self, since_cfg): - since_cfg = "'" + since_cfg + "'" + def invoke_ts_cmd(self, since_cfg, num_retry=0): cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg] cmd = " ".join(cmd_opts) rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR) - if rc: + new_dump = "" + if rc == EXT_LOCKFAIL: + syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr)) + elif rc == EXT_RETRY: + if num_retry <= MAX_RETRY_LIMIT: + return self.invoke_ts_cmd(since_cfg, num_retry+1) + else: + syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr)) + elif rc != EXT_SUCCESS: syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr)) - new_dump = self.parse_ts_dump_name(stdout) - if not new_dump: - syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) - else: - syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) + else: # EXT_SUCCESS + new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name + if not new_dump: + syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) + else: + syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) return new_dump def verify_rate_limit_intervals(self, global_cooloff, container_cooloff): diff --git a/scripts/generate_dump b/scripts/generate_dump index ef06e929e1..85a7f6a057 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -63,6 +63,10 @@ handle_exit() ECODE=$? echo "Removing lock. Exit: $ECODE" >&2 $RM $V -rf ${LOCKDIR} + # Echo the filename as the last statement if the generation succeeds + if [[ -f $TARFILE && ($ECODE == $EXT_SUCCESS || $ECODE == $RETURN_CODE) ]]; then + echo $TARFILE + fi } handle_signal() @@ -1360,8 +1364,6 @@ main() { # Invoke the TechSupport Cleanup Hook setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log & - - echo ${TARFILE} if ! $SAVE_STDERR then @@ -1494,7 +1496,6 @@ while getopts ":xnvhzas:t:r:d" opt; do ;; v) # echo commands about to be run to stderr - set -v V="-v" ;; n) @@ -1547,7 +1548,7 @@ fi ## Attempt Locking ## -if mkdir "${LOCKDIR}" &>/dev/null; then +if $MKDIR "${LOCKDIR}" &>/dev/null; then trap 'handle_exit' EXIT echo "$$" > "${PIDFILE}" # This handler will exit the script upon receiving these interrupts @@ -1555,6 +1556,9 @@ if mkdir "${LOCKDIR}" &>/dev/null; then trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM echo "Lock succesfully accquired and installed signal handlers" # Proceed with the actual code + if [[ ! -z "${V}" ]]; then + set -v + fi main else # lock failed, check if the other PID is alive diff --git a/tests/coredump_gen_handler_test.py b/tests/coredump_gen_handler_test.py index e5270460eb..bf4ae8dc78 100644 --- a/tests/coredump_gen_handler_test.py +++ b/tests/coredump_gen_handler_test.py @@ -3,10 +3,12 @@ import sys import pyfakefs import unittest +import signal from pyfakefs.fake_filesystem_unittest import Patcher from swsscommon import swsscommon from utilities_common.general import load_module_from_source from utilities_common.db import Db +from utilities_common.auto_techsupport_helper import EXT_RETRY from .mock_tables import dbconnector sys.path.append("scripts") @@ -18,6 +20,9 @@ /tmp/saisdkdump """ +def signal_handler(signum, frame): + raise Exception("Timed out!") + def set_auto_ts_cfg(redis_mock, state="disabled", rate_limit_interval="0", max_core_size="0.0", @@ -264,7 +269,7 @@ def test_since_argument(self): def mock_cmd(cmd, env): ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "--since '4 days ago'" in cmd_str: + if "--since 4 days ago" in cmd_str: patcher.fs.create_file(ts_dump) return 0, AUTO_TS_STDOUT + ts_dump, "" elif "date --date=4 days ago" in cmd_str: @@ -330,7 +335,7 @@ def test_invalid_since_argument(self): def mock_cmd(cmd, env): ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "--since '2 days ago'" in cmd_str: + if "--since 2 days ago" in cmd_str: patcher.fs.create_file(ts_dump) print(AUTO_TS_STDOUT + ts_dump) return 0, AUTO_TS_STDOUT + ts_dump, "" @@ -396,3 +401,30 @@ def mock_cmd(cmd, env): assert "orchagent.12345.123.core.gz" in current_fs assert "lldpmgrd.12345.22.core.gz" in current_fs assert "python3.12345.21.core.gz" in current_fs + + def test_max_retry_ts_failure(self): + """ + Scenario: TS subprocess is continously returning EXT_RETRY + Make sure auto-ts is not exceeding the limit + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled") + set_feature_table_cfg(redis_mock, state="enabled") + with Patcher() as patcher: + def mock_cmd(cmd, env): + return EXT_RETRY, "", "" + + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) + + signal.signal(signal.SIGALRM, signal_handler) + signal.alarm(5) # 5 seconds + try: + cls.handle_core_dump_creation_event() + except Exception: + assert False, "Method should not time out" + finally: + signal.alarm(0) + diff --git a/utilities_common/auto_techsupport_helper.py b/utilities_common/auto_techsupport_helper.py index 0fba656521..4eaae933b0 100644 --- a/utilities_common/auto_techsupport_helper.py +++ b/utilities_common/auto_techsupport_helper.py @@ -11,8 +11,9 @@ "CORE_DUMP_DIR", "CORE_DUMP_PTRN", "TS_DIR", "TS_PTRN", "CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF", "CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB", - "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", - "TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB" + "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF", + "SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY", + "EXT_SUCCESS", "MAX_RETRY_LIMIT" ] + [ # Methods "verify_recent_file_creation", "get_ts_dumps", @@ -60,6 +61,11 @@ TIME_BUF = 20 SINCE_DEFAULT = "2 days ago" +# Techsupport Exit Codes +EXT_LOCKFAIL = 2 +EXT_RETRY = 4 +EXT_SUCCESS = 0 +MAX_RETRY_LIMIT = 2 # Helper methods def subprocess_exec(cmd, env=None):