Skip to content

Commit

Permalink
Increase startup_tsa_tsb time based on Cisco's observation. (sonic-ne…
Browse files Browse the repository at this point in the history
…t#15367)

Description of PR
We are noticing that the time diff ranges from 120 to 145 secs, so we are increasing the threshold to 160 secs to be on the safer side. After increasing the time, we are seeing all test cases pass together with all the other changes that were added in PR sonic-net#13290.

In our case, since kdump is enabled, the reboot cause in the abnormal reboot case is Kernel Panic. Made the appropriate change for the Cisco chassis.

Approach
What is the motivation for this PR?
Check the functionality with a slight increase in time

How did you do it?
How did you verify/test it?
Any platform specific information?
Validated on Cisco 8808 chassis with T2 profile

Supported testbed topology if it's a new test case?
Documentation
=========================== short test summary info ============================
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_cold_reboot[sfd-lt2-lc0]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_abnormal_reboot[sfd-lt2-lc0]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_user_init_tsa[sfd-lt2-lc0]
PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsa_while_service_run_on_dut[sfd-lt2-lc0]
PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsb_while_service_run_on_dut[sfd-lt2-lc0]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_timer_efficiency[sfd-lt2-lc0]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_cold_reboot[sfd-lt2-lc1]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_dut_abnormal_reboot[sfd-lt2-lc1]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_user_init_tsa[sfd-lt2-lc1]
PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsa_while_service_run_on_dut[sfd-lt2-lc1]
PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsb_while_service_run_on_dut[sfd-lt2-lc1]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_timer_efficiency[sfd-lt2-lc1]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_supervisor_cold_reboot[sfd-lt2-sup]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_supervisor_abnormal_reboot[sfd-lt2-sup]
PASSED bgp/test_startup_tsa_tsb_service.py::test_user_init_tsb_on_sup_while_service_run_on_dut[sfd-lt2-sup]
PASSED bgp/test_startup_tsa_tsb_service.py::test_tsa_tsb_service_with_tsa_on_sup[sfd-lt2-sup]
================= 16 passed, 1 warning in 31255.04s (8:40:55) ==================

co-authorized by: jianquanye@microsoft.com
  • Loading branch information
vperumal authored and mssonicbld committed Nov 7, 2024
1 parent 55644dd commit 0f9edf4
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 19 deletions.
8 changes: 4 additions & 4 deletions tests/bgp/test_reliable_tsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@ def test_sup_tsa_act_with_sup_reboot(duthosts, localhost, enum_supervisor_dut_ho
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in the same maintenance state like before supervisor reboot
Expand Down Expand Up @@ -1043,7 +1043,7 @@ def test_dut_tsa_act_with_reboot_when_sup_dut_on_tsb_init(duthosts, localhost, e
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")
# Verify startup_tsa_tsb service is not started and in exited due to manual TSA
pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'exited'),
Expand Down Expand Up @@ -1355,7 +1355,7 @@ def test_sup_tsa_when_startup_tsa_tsb_service_running(duthosts, localhost, enum_
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")
# Verify startup_tsa_tsb service is started and running
pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'running'),
Expand Down Expand Up @@ -1464,7 +1464,7 @@ def test_sup_tsb_when_startup_tsa_tsb_service_running(duthosts, localhost, enum_
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")
# Verify startup_tsa_tsb service is started and running
pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'running'),
Expand Down
48 changes: 33 additions & 15 deletions tests/bgp/test_startup_tsa_tsb_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

logger = logging.getLogger(__name__)


KERNEL_PANIC_REBOOT_CAUSE = "Kernel Panic"
COLD_REBOOT_CAUSE = 'cold'
UNKNOWN_REBOOT_CAUSE = "Unknown"
SUP_REBOOT_CAUSE = 'Reboot from Supervisor'
Expand Down Expand Up @@ -209,7 +209,7 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one
logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(duthost)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in maintenance state.
Expand Down Expand Up @@ -325,7 +325,7 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand
service_uptime = get_tsa_tsb_service_uptime(duthost)
time_diff = (service_uptime - dut_uptime).total_seconds()
logger.info("Time difference between dut up-time & tsa_tsb_service up-time is {}".format(int(time_diff)))
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Make sure BGP containers are running properly before verifying
Expand Down Expand Up @@ -384,8 +384,17 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand
# Make sure the dut's reboot cause is as expected
logger.info("Check reboot cause of the dut")
reboot_cause = get_reboot_cause(duthost)
pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE,
"Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE))
out = duthost.command('show kdump config')
if "Enabled" not in out["stdout"]:
pytest_assert(
reboot_cause == UNKNOWN_REBOOT_CAUSE,
"Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)
)
else:
pytest_assert(
reboot_cause == KERNEL_PANIC_REBOOT_CAUSE,
"Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE)
)


@pytest.mark.disable_loganalyzer
Expand Down Expand Up @@ -442,7 +451,7 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in maintenance state.
Expand Down Expand Up @@ -592,7 +601,7 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Make sure BGP containers are running properly before verifying
Expand Down Expand Up @@ -669,8 +678,17 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en
# Make sure the Supervisor's reboot cause is as expected
logger.info("Check reboot cause of the supervisor")
reboot_cause = get_reboot_cause(suphost)
pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE,
"Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE))
out = suphost.command('show kdump config')
if "Enabled" not in out["stdout"]:
pytest_assert(
reboot_cause == UNKNOWN_REBOOT_CAUSE,
"Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)
)
else:
pytest_assert(
reboot_cause == KERNEL_PANIC_REBOOT_CAUSE,
"Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE)
)


@pytest.mark.disable_loganalyzer
Expand Down Expand Up @@ -718,7 +736,7 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p
logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(duthost)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Ensure startup_tsa_tsb service is in exited state after dut reboot
Expand Down Expand Up @@ -825,7 +843,7 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o
logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(duthost)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in maintenance state.
Expand Down Expand Up @@ -941,7 +959,7 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o
logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(duthost)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in maintenance state.
Expand Down Expand Up @@ -1059,7 +1077,7 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost,
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in maintenance state.
Expand Down Expand Up @@ -1184,7 +1202,7 @@ def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_f
logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(duthost)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

logging.info("Wait until all critical services are fully started")
Expand Down Expand Up @@ -1309,7 +1327,7 @@ def test_tsa_tsb_service_with_tsa_on_sup(duthosts, localhost,
logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime))
service_uptime = get_tsa_tsb_service_uptime(linecard)
time_diff = (service_uptime - dut_uptime).total_seconds()
pytest_assert(int(time_diff) < 120,
pytest_assert(int(time_diff) < 160,
"startup_tsa_tsb service started much later than the expected time after dut reboot")

# Verify DUT is in maintenance state.
Expand Down

0 comments on commit 0f9edf4

Please sign in to comment.