From 8f9cb56406f008cd2fd0cb47fc2112b3cfa1a97d Mon Sep 17 00:00:00 2001
From: yozhao101 <56170650+yozhao101@users.noreply.github.com>
Date: Fri, 30 Oct 2020 17:00:44 -0700
Subject: [PATCH] [pytest] Ignore specific syslog error messages when testing
 autorestart (#2430)

When the pytest script is run to test the autorestart feature, error
messages appear in the syslog and cause the logAnalyzer to fail at the
end of testing.

To fix this issue, I examined all the error messages written into the
syslog while testing the autorestart feature and added them to a
whitelist. The reasons why we should ignore them are documented in the
docstring of the `ignore_expected_loganalyzer_exception(...)` fixture.

After whitelisting these error messages, we also do a post-check to
verify that all the critical processes are alive after testing the
autorestart feature.

Signed-off-by: Yong Zhao
---
 .../autorestart/test_container_autorestart.py | 122 ++++++++++++++++--
 1 file changed, 113 insertions(+), 9 deletions(-)

diff --git a/tests/autorestart/test_container_autorestart.py b/tests/autorestart/test_container_autorestart.py
index 67acb1a4e8b..ae3c4dd62b2 100755
--- a/tests/autorestart/test_container_autorestart.py
+++ b/tests/autorestart/test_container_autorestart.py
@@ -21,6 +21,76 @@
 CONTAINER_RESTART_THRESHOLD_SECS = 180


+@pytest.fixture(autouse=True)
+def ignore_expected_loganalyzer_exception(duthost, loganalyzer):
+    """
+    Ignore expected failure/error messages while testing the autorestart feature.
+
+    First, since we kill a critical process in a specific container to test the autorestart
+    feature, we expect to see error messages fired by Monit such as
+    "ERR monit[563]: 'lldp|lldpd' status failed (1) -- 'lldpd' is not running."
+
+    Second, if the teammgrd process is killed for testing, the orchagent process in the
+    swss container writes error messages into syslog such as
+    "ERR swss#orchagent: :- removeLag: Failed to remove ref count 1 LAG PortChannel10008."
+    When teamd is restarted, an error message appears in the syslog: "ERR teamd#teamsyncd:
+    :- readData: netlink reports an error=-33 on reading a netlink socket."
+
+    Third, while the pmon container is restarted because the ledd process was killed for testing,
+    the xcvrd process writes an error message into syslog such as "ERR pmon#xcvrd[29]: :- initializeGlobalConfig:
+    Sonic database config global file doesn't exist at /var/run/redis/sonic-db/database_global.json."
+    The thermalctld process writes an error message into syslog such as "ERR pmon#thermalctld[33]:
+    Caught exception while initializing thermal manager."
+
+    Fourth, if the orchagent process is killed and the swss container is restarted, the syncd process
+    writes error messages such as "ERR syncd#syncd: [none] driverEgressMemoryUpdate:1395
+    Error getting cosq for port 1.". At the same time, the syncd process also writes WARNING messages
+    into syslog such as "WARNING syncd#syncd:- saiDiscover: skipping since it causes crash:
+    SAI_STP_ATTR_BRIDGE_ID". Since the keyword "crash" appears in these warning messages, logAnalyzer
+    would fail.
+
+    Fifth, systemd fires an error message such as "ERR systemd[1]: Failed to start SNMP/TEAMD container."
+    when the SNMP/TEAMD container hits its restart limit. route_check.py also writes an error message into syslog.
+
+    """
+    monit_ignoreRegex = [
+        ".*ERR monit.*",
+    ]
+    swss_ignoreRegex = [
+        ".*ERR swss#orchagent.*removeLag.*",
+    ]
+    pmon_ignoreRegex = [
+        ".*ERR pmon#xcvrd.*initializeGlobalConfig.*",
+        ".*ERR pmon#thermalctld.*Caught exception while initializing thermal manager.*",
+    ]
+    syncd_ignoreRegex = [
+        ".*ERR syncd#syncd.*driverEgressMemoryUpdate.*",
+        ".*ERR syncd#syncd.*brcm_sai*",
+        ".*WARNING syncd#syncd.*saiDiscover: skipping since it causes crash.*",
+    ]
+    teamd_ignoreRegex = [
+        ".*ERR teamd#teamsyncd.*readData.*netlink reports an error=-33 on reading a netlink socket.*",
+    ]
+    systemd_ignoreRegex = [
+        ".*ERR systemd.*Failed to start .* container*",
+    ]
+    kernel_ignoreRegex = [
+        ".*ERR kernel.*PortChannel.*",
+    ]
+    other_ignoreRegex = [
+        ".*ERR route_check.*",
+    ]
+    if loganalyzer:
+        loganalyzer.ignore_regex.extend(monit_ignoreRegex)
+        loganalyzer.ignore_regex.extend(swss_ignoreRegex)
+        loganalyzer.ignore_regex.extend(pmon_ignoreRegex)
+        loganalyzer.ignore_regex.extend(syncd_ignoreRegex)
+        loganalyzer.ignore_regex.extend(teamd_ignoreRegex)
+        loganalyzer.ignore_regex.extend(systemd_ignoreRegex)
+        loganalyzer.ignore_regex.extend(kernel_ignoreRegex)
+        loganalyzer.ignore_regex.extend(other_ignoreRegex)
+
+
 def get_group_program_info(duthost, container_name, group_name):
     """
     @summary: Get program names, running status and their pids by analyzing the command
@@ -34,7 +104,7 @@ def get_group_program_info(duthost, container_name, group_name):
     program_status = None
     program_pid = -1

-    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name))
+    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name), module_ignore_errors=True)
     for program_info in program_list["stdout_lines"]:
         if program_info.find(group_name) != -1:
             program_name = program_info.split()[0].split(':')[1].strip()
@@ -59,7 +129,7 @@ def get_program_info(duthost, container_name, program_name):
     program_status = None
     program_pid = -1

-    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name))
+    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name), module_ignore_errors=True)
     for program_info in program_list["stdout_lines"]:
         if program_info.find(program_name) != -1:
             program_status = program_info.split()[1].strip()
@@ -137,6 +207,20 @@ def is_hiting_start_limit(duthost, container_name):
     return False


+def clear_failed_flag_and_restart(duthost, container_name):
+    """
+    @summary: If a container hits the restart limitation, then we clear the failed flag and
+              restart it.
+    """
+    logger.info("{} hits start limit and clear reset-failed flag".format(container_name))
+    duthost.shell("sudo systemctl reset-failed {}.service".format(container_name))
+    duthost.shell("sudo systemctl start {}.service".format(container_name))
+    restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
+                           CONTAINER_CHECK_INTERVAL_SECS,
+                           check_container_state, duthost, container_name, True)
+    pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))
+
+
 def verify_autorestart_with_critical_process(duthost, container_name, program_name,
                                              program_status, program_pid):
     """
@@ -165,13 +249,7 @@ def verify_autorestart_with_critical_process(duthost, container_name, program_na
                            check_container_state, duthost, container_name, True)
     if not restarted:
         if is_hiting_start_limit(duthost, container_name):
-            logger.info("{} hits start limit and clear reset-failed flag".format(container_name))
-            duthost.shell("sudo systemctl reset-failed {}.service".format(container_name))
-            duthost.shell("sudo systemctl start {}.service".format(container_name))
-            restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
-                                   CONTAINER_CHECK_INTERVAL_SECS,
-                                   check_container_state, duthost, container_name, True)
-            pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))
+            clear_failed_flag_and_restart(duthost, container_name)
         else:
             pytest.fail("Failed to restart container '{}'".format(container_name))

@@ -202,6 +280,30 @@ def verify_no_autorestart_with_non_critical_process(duthost, container_name, pro
     logger.info("Restart the program '{}' in container '{}'".format(program_name, container_name))
     duthost.shell("docker exec {} supervisorctl start {}".format(container_name, program_name))

+def check_all_critical_processes_status(duthost):
+    processes_status = duthost.all_critical_process_status()
+    for container_name, processes in processes_status.items():
+        if processes["status"] is False or len(processes["exited_critical_process"]) > 0:
+            return False
+
+    return True
+
+
+def postcheck_critical_processes_status(duthost, container_autorestart_states):
+    """
+    @summary: Do the post check to see whether all the critical processes are alive after testing
+              the autorestart feature.
+    First we restart the containers which hit the restart limitation and then do the post check.
+    """
+    for container_name in container_autorestart_states.keys():
+        if is_hiting_start_limit(duthost, container_name):
+            clear_failed_flag_and_restart(duthost, container_name)
+
+    pytest_assert(wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
+                             CONTAINER_CHECK_INTERVAL_SECS,
+                             check_all_critical_processes_status, duthost),
+                  "Post checking the health of critical processes failed.")
+

 def test_containers_autorestart(duthost, tbinfo):
     """
@@ -275,3 +377,5 @@ def test_containers_autorestart(duthost, tbinfo):
             duthost.shell("sudo config feature autorestart {} disabled".format(container_name))

         logger.info("End of testing the container '{}'".format(container_name))
+
+    postcheck_critical_processes_status(duthost, container_autorestart_states)