[platform] Fix the reboot SONiC stuck issue (#1130)
Signed-off-by: Xin Wang <xinw@mellanox.com>
wangxin authored and jleveque committed Sep 27, 2019
1 parent 658b81e commit 9546777
Showing 4 changed files with 70 additions and 53 deletions.
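At its core, this commit replaces the multiprocessing.Process/Queue pair that tests/common/devices.py used for asynchronous Ansible module runs with a multiprocessing.pool.ThreadPool: a blocking Queue.get() could hang the whole test run when the reboot command got stuck on the DUT, whereas a ThreadPool hands back an AsyncResult that can be polled and collected with a timeout. A minimal standalone sketch of the new pattern (illustrative only; slow_task is a hypothetical stand-in for an Ansible module call):

    import time
    from multiprocessing import TimeoutError
    from multiprocessing.pool import ThreadPool

    def slow_task(seconds):
        # Hypothetical stand-in for self.module(*module_args, **complex_args)
        time.sleep(seconds)
        return "done after %s seconds" % seconds

    pool = ThreadPool()
    async_result = pool.apply_async(slow_task, (5,))
    # The caller keeps control and decides how long to wait, instead of
    # blocking indefinitely on a Queue.get() fed by a child process.
    try:
        print(async_result.get(timeout=10))
    except TimeoutError:
        pool.terminate()  # give up on the stuck worker and move on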
15 changes: 6 additions & 9 deletions tests/common/devices.py
@@ -10,7 +10,7 @@
 import json
 import logging
 import os
-from multiprocessing import Process, Queue
+from multiprocessing.pool import ThreadPool
 
 from errors import RunAnsibleModuleFail
 from errors import UnsupportedAnsibleModule
@@ -45,13 +45,11 @@ def _run(self, *module_args, **complex_args):
         module_async = complex_args.pop('module_async', False)
 
         if module_async:
-            q = Queue()
-            def run_module(queue, module_args, complex_args):
-                res = self.module(*module_args, **complex_args)
-                q.put(res[self.hostname])
-            p = Process(target=run_module, args=(q, module_args, complex_args))
-            p.start()
-            return p, q
+            def run_module(module_args, complex_args):
+                return self.module(*module_args, **complex_args)[self.hostname]
+            pool = ThreadPool()
+            result = pool.apply_async(run_module, (module_args, complex_args))
+            return pool, result
 
         res = self.module(*module_args, **complex_args)[self.hostname]
         if res.is_failed and not module_ignore_errors:
@@ -225,4 +223,3 @@ def get_pmon_daemon_list(self):
 
         logging.info("Pmon daemon list for this platform is %s" % str(daemon_list))
         return daemon_list
-
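With this change, any AnsibleHost call made with module_async=True returns a (ThreadPool, AsyncResult) pair instead of a (Process, Queue) pair. A hedged usage sketch, mirroring how tests/platform/test_reboot.py below consumes it (dut stands for an AnsibleHost instance):

    # Kick off the reboot without blocking the test.
    reboot_task, reboot_res = dut.command("reboot", module_ignore_errors=True, module_async=True)
    # ... wait for the device to go down ...
    if not reboot_res.ready():
        reboot_task.terminate()           # tear down the stuck worker threads
    else:
        print(reboot_res.get(timeout=0))  # result is ready; fetch it without waiting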

4 changes: 2 additions & 2 deletions tests/platform/check_daemon_status.py
@@ -17,14 +17,14 @@ def check_pmon_daemon_status(dut):
     daemon_status = {}
     try:
         for daemon in daemon_list:
-            output = dut.shell('docker exec -it pmon supervisorctl status | grep %s' % daemon, module_ignore_errors=True)
+            output = dut.shell('docker exec pmon supervisorctl status | grep %s' % daemon, module_ignore_errors=True)
             if bool(output["stdout_lines"]):
                 expected_line = output["stdout_lines"][0]
                 expected_line_list = expected_line.split()
                 daemon_status[daemon] = (daemon in expected_line_list and 'RUNNING' in expected_line_list)
                 logging.debug("Daemon %s status is %s" % (daemon, str(daemon_status[daemon])))
             else:
-                logging.debug("Daemon %s is not exist" % daemon)
+                logging.debug("Daemon %s does not exist" % daemon)
                 return False
         return all(daemon_status.values())
     except:
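The dropped -it flags are the substance of this fix: Ansible's shell module runs commands without a controlling TTY, and docker exec with -t in that situation typically fails with "the input device is not a TTY", so the status check produced no output. A short sketch of the corrected, non-interactive call (hedged; psud is just an example daemon name):

    # No -i/-t: there is no TTY to allocate under Ansible's shell module.
    output = dut.shell('docker exec pmon supervisorctl status | grep psud',
                       module_ignore_errors=True)
    if output["stdout_lines"]:
        fields = output["stdout_lines"][0].split()
        # supervisorctl prints lines like: psud  RUNNING  pid 42, uptime 0:01:23
        print(fields[0], "running:", "RUNNING" in fields)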
18 changes: 8 additions & 10 deletions tests/platform/check_interface_status.py
@@ -35,35 +35,33 @@ def check_interface_status(dut, interfaces):
     """
     @summary: Check the admin and oper status of the specified interfaces on DUT.
     @param dut: The AnsibleHost object of DUT. For interacting with DUT.
-    @param hostname:
     @param interfaces: List of interfaces that need to be checked.
     """
     logging.info("Check interface status using cmd 'intfutil'")
-    mg_ports = dut.minigraph_facts(host=dut.hostname)["ansible_facts"]["minigraph_ports"]
+    mg_ports = dut.minigraph_facts(host=dut.hostname)["ansible_facts"]["minigraph_ports"]
     output = dut.command("intfutil description")
     intf_status = parse_intf_status(output["stdout_lines"][2:])
     check_intf_presence_command = 'show interface transceiver presence {}'
     for intf in interfaces:
         expected_oper = "up" if intf in mg_ports else "down"
         expected_admin = "up" if intf in mg_ports else "down"
-        if not intf in intf_status:
+        if intf not in intf_status:
             logging.info("Missing status for interface %s" % intf)
             return False
         if intf_status[intf]["oper"] != expected_oper:
-            logging.info("Oper status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["oper"], expected_oper))
+            logging.info("Oper status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["oper"],
+                                                                               expected_oper))
             return False
         if intf_status[intf]["admin"] != expected_admin:
-            logging.info("Admin status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["admin"], expected_admin))
+            logging.info("Admin status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["admin"],
+                                                                                expected_admin))
             return False
 
         # Cross check the interface SFP presence status
         check_presence_output = dut.command(check_intf_presence_command.format(intf))
         assert check_presence_output["rc"] == 0, "Failed to read interface %s transceiver presence" % intf
-        logging.info(str(check_presence_output["stdout_lines"][2]))
         presence_list = check_presence_output["stdout_lines"][2].split()
         logging.info(str(presence_list))
-        assert intf in presence_list, "Wrong interface name in the output %s" % str(presence_list)
-        assert 'Present' in presence_list, "Status is not expected, output %s" % str(presence_list)
+        assert intf in presence_list, "Wrong interface name in the output: %s" % str(presence_list)
+        assert 'Present' in presence_list, "Status is not expected, presence status: %s" % str(presence_list)
 
     logging.info("Check interface status using the interface_facts module")
     intf_facts = dut.interface_facts(up_ports=mg_ports)["ansible_facts"]
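For context, the presence assertions above depend on the layout of `show interface transceiver presence <port>`: two header lines followed by a data row, which is why stdout_lines[2] is split. A self-contained sketch with made-up output (the exact column layout is an assumption):

    # Hypothetical output of: show interface transceiver presence Ethernet0
    stdout_lines = [
        "Port       Presence",
        "---------  --------",
        "Ethernet0  Present",
    ]
    presence_list = stdout_lines[2].split()  # ['Ethernet0', 'Present']
    assert "Ethernet0" in presence_list, "Wrong interface name in the output: %s" % presence_list
    assert "Present" in presence_list, "Status is not expected, presence status: %s" % presence_list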
86 changes: 54 additions & 32 deletions tests/platform/test_reboot.py
@@ -13,14 +13,18 @@
 import time
 import sys
 
+from datetime import datetime
+
 import pytest
 
 from platform_fixtures import conn_graph_facts
 from psu_controller import psu_controller
 from common.utilities import wait_until
 from check_critical_services import check_critical_services
 from check_transceiver_status import check_transceiver_basic
 from check_daemon_status import check_pmon_daemon_status
+from check_all_interface_info import check_interface_information
+
 pytestmark = [pytest.mark.disable_loganalyzer]
 
 REBOOT_TYPE_WARM = "warm"
@@ -30,32 +34,33 @@
 REBOOT_TYPE_WATCHDOG = "watchdog"
 
 reboot_ctrl_dict = {
-    REBOOT_TYPE_POWEROFF : {
-        "timeout" : 300,
-        "cause" : "Power Loss"
+    REBOOT_TYPE_POWEROFF: {
+        "timeout": 300,
+        "cause": "Power Loss"
     },
-    REBOOT_TYPE_COLD : {
-        "command" : "reboot",
-        "timeout" : 300,
-        "cause" : "reboot"
+    REBOOT_TYPE_COLD: {
+        "command": "reboot",
+        "timeout": 300,
+        "cause": "reboot"
     },
-    REBOOT_TYPE_FAST : {
-        "command" : "fast-reboot",
-        "timeout" : 180,
-        "cause" : "fast-reboot"
+    REBOOT_TYPE_FAST: {
+        "command": "fast-reboot",
+        "timeout": 180,
+        "cause": "fast-reboot"
     },
-    REBOOT_TYPE_WARM : {
-        "command" : "warm-reboot",
-        "timeout" : 180,
-        "cause" : "warm-reboot"
+    REBOOT_TYPE_WARM: {
+        "command": "warm-reboot",
+        "timeout": 180,
+        "cause": "warm-reboot"
     },
-    REBOOT_TYPE_WATCHDOG : {
-        "command" : "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog().arm(5); exit()\"",
-        "timeout" : 300,
-        "cause" : "Watchdog"
+    REBOOT_TYPE_WATCHDOG: {
+        "command": "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog().arm(5); exit()\"",
+        "timeout": 300,
+        "cause": "Watchdog"
     }
 }
 
+
 def check_reboot_cause(dut, reboot_cause_expected):
     """
     @summary: Check the reboot cause on DUT.
@@ -73,6 +78,7 @@ def check_reboot_cause(dut, reboot_cause_expected):
 def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, reboot_helper=None, reboot_kwargs=None):
     """
     Perform the specified type of reboot and check platform status.
+    @param localhost: The Localhost object.
     @param dut: The AnsibleHost object of DUT.
     @param interfaces: DUT's interfaces defined by minigraph
     @param reboot_type: The reboot type, pre-defined const that has name convention of REBOOT_TYPE_XXX.
@@ -85,6 +91,9 @@ def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, reboot_helper=None, reboot_kwargs=None):
 
     reboot_timeout = reboot_ctrl_dict[reboot_type]["timeout"]
     reboot_cause = reboot_ctrl_dict[reboot_type]["cause"]
+
+    dut_datetime = datetime.strptime(dut.command('date -u +"%Y-%m-%d %H:%M:%S"')["stdout"], "%Y-%m-%d %H:%M:%S")
+
     if reboot_type == REBOOT_TYPE_POWEROFF:
         assert reboot_helper is not None, "A reboot function must be provided for power off reboot"
 
@@ -93,22 +102,27 @@ def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, reboot_helper=None, reboot_kwargs=None):
         localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=120)
     else:
         reboot_cmd = reboot_ctrl_dict[reboot_type]["command"]
 
-        process, queue = dut.command(reboot_cmd, module_async=True)
+        reboot_task, reboot_res = dut.command(reboot_cmd, module_ignore_errors=True, module_async=True)
 
     logging.info("Wait for DUT to go down")
-    res = localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=120,
-                             module_ignore_errors=True)
+    res = localhost.wait_for(host=dut.hostname, port=22, state="stopped", timeout=180, module_ignore_errors=True)
     if "failed" in res:
-        if process.is_alive():
-            logging.error("Command '%s' is not completed" % reboot_cmd)
-            process.terminate()
-        logging.error("reboot result %s" % str(queue.get()))
-        assert False, "DUT did not go down"
+        try:
+            logging.error("Wait for switch down failed, try to kill any possible stuck reboot task")
+            pid = dut.command("pgrep -f '%s'" % reboot_cmd)["stdout"]
+            dut.command("kill -9 %s" % pid)
+            reboot_task.terminate()
+            logging.error("Result of command '%s': " + str(reboot_res.get(timeout=0)))
+        except Exception as e:
+            logging.error("Exception raised while cleanup reboot task and get result: " + repr(e))
 
     logging.info("Wait for DUT to come back")
     localhost.wait_for(host=dut.hostname, port=22, state="started", delay=10, timeout=reboot_timeout)
 
+    logging.info("Check the uptime to verify whether reboot was performed")
+    dut_uptime = datetime.strptime(dut.command("uptime -s")["stdout"], "%Y-%m-%d %H:%M:%S")
+    assert float(dut_uptime.strftime("%s")) - float(dut_datetime.strftime("%s")) > 10, "Device did not reboot"
+
     logging.info("Wait until all critical services are fully started")
     check_critical_services(dut)
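One subtlety in the new uptime check above: strftime("%s") is not portable Python; it defers to the platform's C library, and glibc formats %s as seconds since the epoch, which holds on the Linux hosts these tests run from. A self-contained illustration with made-up timestamps:

    from datetime import datetime

    # Clock reading taken before the reboot, and the boot time reported by
    # `uptime -s` afterwards (both illustrative).
    dut_datetime = datetime.strptime("2019-09-27 10:00:00", "%Y-%m-%d %H:%M:%S")
    dut_uptime = datetime.strptime("2019-09-27 10:03:21", "%Y-%m-%d %H:%M:%S")

    # On glibc platforms %s yields epoch seconds; a boot time more than a few
    # seconds after the pre-reboot clock proves the device really went down.
    elapsed = float(dut_uptime.strftime("%s")) - float(dut_datetime.strftime("%s"))
    assert elapsed > 10, "Device did not reboot"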

Expand Down Expand Up @@ -210,6 +224,8 @@ def _power_off_reboot_helper(kwargs):
def test_power_off_reboot(testbed_devices, conn_graph_facts, psu_controller, power_off_delay):
"""
@summary: This test case is to perform reboot via powercycle and check platform status
@param testbed_devices: Fixture initialize devices in testbed
@param conn_graph_facts: Fixture parse and return lab connection graph
@param psu_controller: The python object of psu controller
@param power_off_delay: Pytest fixture. The delay between turning off and on the PSU
"""
Expand All @@ -221,22 +237,28 @@ def test_power_off_reboot(testbed_devices, conn_graph_facts, psu_controller, pow
pytest.skip("No PSU controller for %s, skip rest of the testing in this case" % ans_host.hostname)

all_psu = psu_ctrl.get_psu_status()

# Purpose of this list is to control sequence of turning on PSUs in power off testing.
# If there are 2 PSUs, then 3 scenarios would be covered:
# 1. Turn off all PSUs, turn on PSU1, then check.
# 2. Turn off all PSUs, turn on PSU2, then check.
# 3. Turn off all PSUs, turn on one of the PSU, then turn on the other PSU, then check.
power_on_seq_list = []
if all_psu:
power_on_seq_list = [[item] for item in all_psu]
power_on_seq_list.append(all_psu)

logging.info("Got all power on sequences {}".format(power_on_seq_list))

delay_time_list = [15, 5]
poweroff_reboot_kwargs = {}
poweroff_reboot_kwargs["dut"] = ans_host
poweroff_reboot_kwargs = {"dut": ans_host}

for power_on_seq in power_on_seq_list:
poweroff_reboot_kwargs["psu_ctrl"] = psu_ctrl
poweroff_reboot_kwargs["all_psu"] = all_psu
poweroff_reboot_kwargs["power_on_seq"] = power_on_seq
poweroff_reboot_kwargs["delay_time"] = power_off_delay
reboot_and_check(localhost, ans_host, conn_graph_facts["device_conn"], REBOOT_TYPE_POWEROFF, _power_off_reboot_helper, poweroff_reboot_kwargs)
reboot_and_check(localhost, ans_host, conn_graph_facts["device_conn"], REBOOT_TYPE_POWEROFF,
_power_off_reboot_helper, poweroff_reboot_kwargs)


def test_watchdog_reboot(testbed_devices, conn_graph_facts):
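To make the power-on sequence comment in test_power_off_reboot concrete: with two PSUs, the list drives three power-cycle scenarios, each PSU alone and then both together. A small illustration with PSU objects replaced by plain strings:

    all_psu = ["PSU1", "PSU2"]  # stand-ins for PSU status objects

    power_on_seq_list = []
    if all_psu:
        power_on_seq_list = [[item] for item in all_psu]
        power_on_seq_list.append(all_psu)

    # [['PSU1'], ['PSU2'], ['PSU1', 'PSU2']]: recover with only PSU1, only
    # PSU2, then with both PSUs turned back on.
    print(power_on_seq_list)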
