From 0f9edf43ffc45d9b3493878e1abe75914d6a4d01 Mon Sep 17 00:00:00 2001 From: Perumal Venkatesh Date: Wed, 6 Nov 2024 17:26:34 -0800 Subject: [PATCH] Increase startup_tsa_tsb time based on Cisco's observation. (#15367) Description of PR We are noticing that the time diff ranges from 120 to 145 secs. Hence increasing it to 160 secs to be on the safer side. After increasing the time we are seeing all testcases passing with all other changes that were added in PR #13290. In our case, since kdump is enabled, in the abnormal reboot case our reboot-cause is Kernel Panic. Made an appropriate change for Cisco chassis. Approach What is the motivation for this PR? Check the functionality with a slight increase in time How did you do it? How did you verify/test it? Any platform specific information? Validated on Cisco 8808 chassis with T2 profile Supported testbed topology if it's a new test case? Documentation =========================== short test summary info ============================ PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_cold_reboot[sfd-lt2-lc0] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_abnormal_reboot[sfd-lt2-lc0] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_user_init_tsa[sfd-lt2-lc0] PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsa_while_service_run_on_dut[sfd-lt2-lc0] PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsb_while_service_run_on_dut[sfd-lt2-lc0] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_timer_efficiency[sfd-lt2-lc0] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_cold_reboot[sfd-lt2-lc1] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_abnormal_reboot[sfd-lt2-lc1] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_user_init_tsa[sfd-lt2-lc1] PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsa_while_service_run_on_dut[sfd-lt2-lc1] PASSED
bgp/test_startup_tsa_tsb_service.py::test_user_init_tsb_while_service_run_on_dut[sfd-lt2-lc1] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_timer_efficiency[sfd-lt2-lc1] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_supervisor_cold_reboot[sfd-lt2-sup] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_supervisor_abnormal_reboot[sfd-lt2-sup] PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsb_on_sup_while_service_run_on_dut[sfd-lt2-sup] PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_tsa_on_sup[sfd-lt2-sup] ================= 16 passed, 1 warning in 31255.04s (8:40:55) ================== co-authorized by: jianquanye@microsoft.com --- tests/bgp/test_reliable_tsa.py | 8 ++-- tests/bgp/test_startup_tsa_tsb_service.py | 48 ++++++++++++++++------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/tests/bgp/test_reliable_tsa.py b/tests/bgp/test_reliable_tsa.py index e956d6ef26a..928e9590d97 100644 --- a/tests/bgp/test_reliable_tsa.py +++ b/tests/bgp/test_reliable_tsa.py @@ -850,7 +850,7 @@ def test_sup_tsa_act_with_sup_reboot(duthosts, localhost, enum_supervisor_dut_ho logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in the same maintenance state like before supervisor reboot @@ -1043,7 +1043,7 @@ def test_dut_tsa_act_with_reboot_when_sup_dut_on_tsb_init(duthosts, localhost, e logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started 
much later than the expected time after dut reboot") # Verify startup_tsa_tsb service is not started and in exited due to manual TSA pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'exited'), @@ -1355,7 +1355,7 @@ def test_sup_tsa_when_startup_tsa_tsb_service_running(duthosts, localhost, enum_ logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify startup_tsa_tsb service is started and running pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'running'), @@ -1464,7 +1464,7 @@ def test_sup_tsb_when_startup_tsa_tsb_service_running(duthosts, localhost, enum_ logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify startup_tsa_tsb service is started and running pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'running'), diff --git a/tests/bgp/test_startup_tsa_tsb_service.py b/tests/bgp/test_startup_tsa_tsb_service.py index 2b3e779b328..4170fdb766a 100644 --- a/tests/bgp/test_startup_tsa_tsb_service.py +++ b/tests/bgp/test_startup_tsa_tsb_service.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) - +KERNEL_PANIC_REBOOT_CAUSE = "Kernel Panic" COLD_REBOOT_CAUSE = 'cold' UNKNOWN_REBOOT_CAUSE = "Unknown" SUP_REBOOT_CAUSE = 'Reboot from Supervisor' @@ -209,7 +209,7 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, 
enum_rand_one logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -325,7 +325,7 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() logger.info("Time difference between dut up-time & tsa_tsb_service up-time is {}".format(int(time_diff))) - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Make sure BGP containers are running properly before verifying @@ -384,8 +384,17 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut") reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)) + out = duthost.command('show kdump config') + if "Enabled" not in out["stdout"]: + pytest_assert( + reboot_cause == UNKNOWN_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE) + ) + else: + pytest_assert( + reboot_cause == KERNEL_PANIC_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE) + ) @pytest.mark.disable_loganalyzer @@ -442,7 +451,7 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) 
time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -592,7 +601,7 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Make sure BGP containers are running properly before verifying @@ -669,8 +678,17 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en # Make sure the Supervisor's reboot cause is as expected logger.info("Check reboot cause of the supervisor") reboot_cause = get_reboot_cause(suphost) - pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)) + out = suphost.command('show kdump config') + if "Enabled" not in out["stdout"]: + pytest_assert( + reboot_cause == UNKNOWN_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE) + ) + else: + pytest_assert( + reboot_cause == KERNEL_PANIC_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE) + ) @pytest.mark.disable_loganalyzer @@ -718,7 +736,7 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service 
started much later than the expected time after dut reboot") # Ensure startup_tsa_tsb service is in exited state after dut reboot @@ -825,7 +843,7 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -941,7 +959,7 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -1059,7 +1077,7 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. 
@@ -1184,7 +1202,7 @@ def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_f logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") logging.info("Wait until all critical services are fully started") @@ -1309,7 +1327,7 @@ def test_tsa_tsb_service_with_tsa_on_sup(duthosts, localhost, logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 160, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state.