[platform] Fix the reboot SONiC stuck issue #1130

Merged · 3 commits · Sep 27, 2019
Changes from 2 commits
15 changes: 6 additions & 9 deletions tests/common/devices.py
@@ -10,7 +10,7 @@
import json
import logging
import os
from multiprocessing import Process, Queue
from multiprocessing.pool import ThreadPool

from errors import RunAnsibleModuleFail
from errors import UnsupportedAnsibleModule
@@ -45,13 +45,11 @@ def _run(self, *module_args, **complex_args):
module_async = complex_args.pop('module_async', False)

if module_async:
q = Queue()
def run_module(queue, module_args, complex_args):
res = self.module(*module_args, **complex_args)
q.put(res[self.hostname])
p = Process(target=run_module, args=(q, module_args, complex_args))
p.start()
return p, q
def run_module(module_args, complex_args):
return self.module(*module_args, **complex_args)[self.hostname]
pool = ThreadPool()
result = pool.apply_async(run_module, (module_args, complex_args))
return pool, result

res = self.module(*module_args, **complex_args)[self.hostname]
if res.is_failed and not module_ignore_errors:
@@ -225,4 +223,3 @@ def get_pmon_daemon_list(self):

logging.info("Pmon daemon list for this platform is %s" % str(daemon_list))
return daemon_list
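
The switch from Process/Queue to ThreadPool changes the contract of module_async=True: callers now receive a (pool, AsyncResult) pair instead of a (process, queue) pair. A minimal sketch of the new pattern, with hypothetical stand-ins (run_long_command, slow_echo) for the Ansible module wrapper:

```python
# Minimal sketch of the ThreadPool-based async pattern above.
# run_long_command and slow_echo are illustrative, not part of this PR.
import time
from multiprocessing.pool import ThreadPool

def run_long_command(func, *args):
    pool = ThreadPool(processes=1)               # one worker per async command
    async_result = pool.apply_async(func, args)  # returns immediately
    return pool, async_result

def slow_echo(text):
    time.sleep(2)
    return text

pool, res = run_long_command(slow_echo, "rebooting")
try:
    print(res.get(timeout=5))  # block up to 5 seconds for the result
finally:
    pool.terminate()           # tear the pool down even if the worker is stuck
```

Because pool workers are daemonic threads inside the test process, a command that never returns (e.g. a reboot that severs the SSH session) can no longer leave an orphaned child process behind the way the old Process-based version could.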

4 changes: 2 additions & 2 deletions tests/platform/check_daemon_status.py
@@ -17,14 +17,14 @@ def check_pmon_daemon_status(dut):
daemon_status = {}
try:
for daemon in daemon_list:
output = dut.shell('docker exec -it pmon supervisorctl status | grep %s' % daemon, module_ignore_errors=True)
output = dut.shell('docker exec pmon supervisorctl status | grep %s' % daemon, module_ignore_errors=True)
if bool(output["stdout_lines"]):
expected_line = output["stdout_lines"][0]
expected_line_list = expected_line.split()
daemon_status[daemon] = (daemon in expected_line_list and 'RUNNING' in expected_line_list)
logging.debug("Daemon %s status is %s" % (daemon, str(daemon_status[daemon])))
else:
logging.debug("Daemon %s is not exist" % daemon)
logging.debug("Daemon %s does not exist" % daemon)
return False
return all(daemon_status.values())
except:
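
Dropping -it from docker exec is the substantive fix in this file: -t requests a pseudo-TTY, which fails when Ansible runs the command without a controlling terminal ("the input device is not a TTY"), so the status query could fail even for healthy daemons. The parsing logic itself is unchanged; a hedged sketch of how one supervisorctl status line is reduced to a boolean, using an illustrative sample line:

```python
# Illustrative supervisorctl status line (format: name STATUS pid N, uptime H:MM:SS).
line = "psud                             RUNNING   pid 42, uptime 1:23:45"
fields = line.split()
daemon = "psud"
# The daemon counts as healthy only if its name and RUNNING both appear.
daemon_ok = daemon in fields and "RUNNING" in fields
print(daemon_ok)  # True
```
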
18 changes: 8 additions & 10 deletions tests/platform/check_interface_status.py
@@ -35,35 +35,33 @@ def check_interface_status(dut, interfaces):
"""
@summary: Check the admin and oper status of the specified interfaces on DUT.
@param dut: The AnsibleHost object of DUT. For interacting with DUT.
@param hostname:
@param interfaces: List of interfaces that need to be checked.
"""
logging.info("Check interface status using cmd 'intfutil'")
mg_ports = dut.minigraph_facts(host=dut.hostname)["ansible_facts"]["minigraph_ports"]
output = dut.command("intfutil description")
intf_status = parse_intf_status(output["stdout_lines"][2:])
check_intf_presence_command = 'show interface transceiver presence {}'
for intf in interfaces:
expected_oper = "up" if intf in mg_ports else "down"
expected_admin = "up" if intf in mg_ports else "down"
if not intf in intf_status:
if intf not in intf_status:
logging.info("Missing status for interface %s" % intf)
return False
if intf_status[intf]["oper"] != expected_oper:
logging.info("Oper status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["oper"], expected_oper))
logging.info("Oper status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["oper"],
expected_oper))
return False
if intf_status[intf]["admin"] != expected_admin:
logging.info("Admin status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["admin"], expected_admin))
logging.info("Admin status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["admin"],
expected_admin))
return False

# Cross check the interface SFP presence status
check_presence_output = dut.command(check_intf_presence_command.format(intf))
assert check_presence_output["rc"] == 0, "Failed to read interface %s transceiver presence" % intf
logging.info(str(check_presence_output["stdout_lines"][2]))
presence_list = check_presence_output["stdout_lines"][2].split()
logging.info(str(presence_list))
assert intf in presence_list, "Wrong interface name in the output %s" % str(presence_list)
assert 'Present' in presence_list, "Status is not expected, output %s" % str(presence_list)
assert intf in presence_list, "Wrong interface name in the output: %s" % str(presence_list)
assert 'Present' in presence_list, "Status is not expected, presence status: %s" % str(presence_list)

logging.info("Check interface status using the interface_facts module")
intf_facts = dut.interface_facts(up_ports=mg_ports)["ansible_facts"]
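
parse_intf_status is not shown in this diff, but the presence assertions are easy to follow from the command output alone. A hedged sketch with an illustrative 'show interface transceiver presence' table (the real layout may differ by SONiC version); row index 2 is the first data row, matching the code above:

```python
# stdout_lines as the test would see them; index 2 is the first data row.
stdout_lines = [
    "Port       Presence",
    "---------  --------",
    "Ethernet0  Present",
]
intf = "Ethernet0"
presence_list = stdout_lines[2].split()  # -> ['Ethernet0', 'Present']
assert intf in presence_list, "Wrong interface name in the output: %s" % str(presence_list)
assert 'Present' in presence_list, "Status is not expected, presence status: %s" % str(presence_list)
```
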
86 changes: 54 additions & 32 deletions tests/platform/test_reboot.py
@@ -13,14 +13,18 @@
import time
import sys

from datetime import datetime

import pytest

from platform_fixtures import conn_graph_facts
from psu_controller import psu_controller
from common.utilities import wait_until
from check_critical_services import check_critical_services
from check_transceiver_status import check_transceiver_basic
from check_daemon_status import check_pmon_daemon_status
from check_all_interface_info import check_interface_information

pytestmark = [pytest.mark.disable_loganalyzer]

REBOOT_TYPE_WARM = "warm"
@@ -30,32 +34,33 @@
REBOOT_TYPE_WATCHDOG = "watchdog"

reboot_ctrl_dict = {
REBOOT_TYPE_POWEROFF : {
"timeout" : 300,
"cause" : "Power Loss"
REBOOT_TYPE_POWEROFF: {
"timeout": 300,
"cause": "Power Loss"
},
REBOOT_TYPE_COLD : {
"command" : "reboot",
"timeout" : 300,
"cause" : "reboot"
REBOOT_TYPE_COLD: {
"command": "reboot",
"timeout": 300,
"cause": "reboot"
},
REBOOT_TYPE_FAST : {
"command" : "fast-reboot",
"timeout" : 180,
"cause" : "fast-reboot"
REBOOT_TYPE_FAST: {
"command": "fast-reboot",
"timeout": 180,
"cause": "fast-reboot"
},
REBOOT_TYPE_WARM : {
"command" : "warm-reboot",
"timeout" : 180,
"cause" : "warm-reboot"
REBOOT_TYPE_WARM: {
"command": "warm-reboot",
"timeout": 180,
"cause": "warm-reboot"
},
REBOOT_TYPE_WATCHDOG : {
"command" : "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog().arm(5); exit()\"",
"timeout" : 300,
"cause" : "Watchdog"
REBOOT_TYPE_WATCHDOG: {
"command": "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog().arm(5); exit()\"",
"timeout": 300,
"cause": "Watchdog"
}
}


def check_reboot_cause(dut, reboot_cause_expected):
"""
@summary: Check the reboot cause on DUT.
@@ -73,6 +78,7 @@ def check_reboot_cause(dut, reboot_cause_expected):
def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, reboot_helper=None, reboot_kwargs=None):
"""
Perform the specified type of reboot and check platform status.
@param localhost: The Localhost object.
@param dut: The AnsibleHost object of DUT.
@param interfaces: DUT's interfaces defined by minigraph
@param reboot_type: The reboot type, pre-defined const that has name convention of REBOOT_TYPE_XXX.
@@ -85,6 +91,9 @@ def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, r

reboot_timeout = reboot_ctrl_dict[reboot_type]["timeout"]
reboot_cause = reboot_ctrl_dict[reboot_type]["cause"]

dut_datetime = datetime.strptime(dut.command('date -u +"%Y-%m-%d %H:%M:%S"')["stdout"], "%Y-%m-%d %H:%M:%S")

if reboot_type == REBOOT_TYPE_POWEROFF:
assert reboot_helper is not None, "A reboot function must be provided for power off reboot"

@@ -93,22 +102,27 @@ def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, r
localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=120)
else:
reboot_cmd = reboot_ctrl_dict[reboot_type]["command"]

process, queue = dut.command(reboot_cmd, module_async=True)
reboot_task, reboot_res = dut.command(reboot_cmd, module_ignore_errors=True, module_async=True)

logging.info("Wait for DUT to go down")
res = localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=120,
module_ignore_errors=True)
res = localhost.wait_for(host=dut.hostname, port=22, state="stopped", timeout=180, module_ignore_errors=True)
if "failed" in res:
if process.is_alive():
logging.error("Command '%s' is not completed" % reboot_cmd)
process.terminate()
logging.error("reboot result %s" % str(queue.get()))
assert False, "DUT did not go down"
try:
logging.error("Wait for switch down failed, try to kill any possible stucking reboot task")
jleveque marked this conversation as resolved.
Show resolved Hide resolved
pid = dut.command("pgrep -f '%s'" % reboot_cmd)["stdout"]
dut.command("kill -9 %s" % pid)
reboot_task.terminate()
logging.error("Result of command '%s': " + str(reboot_res.get(timeout=0)))
except Exception as e:
logging.error("Exception raised while cleanup reboot task and get result: " + repr(e))

logging.info("Wait for DUT to come back")
localhost.wait_for(host=dut.hostname, port=22, state="started", delay=10, timeout=reboot_timeout)

logging.info("Check the uptime to verify whether reboot was performed")
dut_uptime = datetime.strptime(dut.command("uptime -s")["stdout"], "%Y-%m-%d %H:%M:%S")
assert float(dut_uptime.strftime("%s")) - float(dut_datetime.strftime("%s")) > 10, "Device did not reboot"

logging.info("Wait until all critical services are fully started")
check_critical_services(dut)

@@ -210,6 +224,8 @@ def _power_off_reboot_helper(kwargs):
def test_power_off_reboot(testbed_devices, conn_graph_facts, psu_controller, power_off_delay):
"""
@summary: This test case is to perform reboot via powercycle and check platform status
@param testbed_devices: Fixture that initializes the devices in the testbed
@param conn_graph_facts: Fixture that parses and returns the lab connection graph
@param psu_controller: The Python object of the PSU controller
@param power_off_delay: Pytest fixture. The delay between turning off and on the PSU
"""
@@ -221,22 +237,28 @@ def test_power_off_reboot(testbed_devices, conn_graph_facts, psu_controller, pow
pytest.skip("No PSU controller for %s, skip rest of the testing in this case" % ans_host.hostname)

all_psu = psu_ctrl.get_psu_status()

# The purpose of this list is to control the sequence of turning on PSUs in power-off testing.
# If there are 2 PSUs, then 3 scenarios are covered (see the sketch after this diff):
# 1. Turn off all PSUs, turn on PSU1, then check.
# 2. Turn off all PSUs, turn on PSU2, then check.
# 3. Turn off all PSUs, turn on one of the PSUs, then turn on the other, then check.
power_on_seq_list = []
if all_psu:
power_on_seq_list = [[item] for item in all_psu]
power_on_seq_list.append(all_psu)

logging.info("Got all power on sequences {}".format(power_on_seq_list))

delay_time_list = [15, 5]
poweroff_reboot_kwargs = {}
poweroff_reboot_kwargs["dut"] = ans_host
poweroff_reboot_kwargs = {"dut": ans_host}

for power_on_seq in power_on_seq_list:
poweroff_reboot_kwargs["psu_ctrl"] = psu_ctrl
poweroff_reboot_kwargs["all_psu"] = all_psu
poweroff_reboot_kwargs["power_on_seq"] = power_on_seq
poweroff_reboot_kwargs["delay_time"] = power_off_delay
reboot_and_check(localhost, ans_host, conn_graph_facts["device_conn"], REBOOT_TYPE_POWEROFF, _power_off_reboot_helper, poweroff_reboot_kwargs)
reboot_and_check(localhost, ans_host, conn_graph_facts["device_conn"], REBOOT_TYPE_POWEROFF,
_power_off_reboot_helper, poweroff_reboot_kwargs)


def test_watchdog_reboot(testbed_devices, conn_graph_facts):
Expand Down