Skip to content

Commit

Permalink
[reload] Improve reload by using sonic.target. (sonic-net#1199)
Browse files Browse the repository at this point in the history
- What I did
To remove the list of hardcoded order-dependent lists of services to stop/restart/reset-failed.

- How I did it
Used sonic.target to stop/restart/reset-failed.

- How to verify it
Execute config reload and observe the services do restart.

Signed-off-by: Stepan Blyshchak <stepanb@nvidia.com>
  • Loading branch information
stepanblyschak authored Mar 1, 2021
1 parent 99673bc commit 8b3bc18
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 203 deletions.
176 changes: 26 additions & 150 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import re
import subprocess
import sys
import threading
import time

from socket import AF_INET, AF_INET6
Expand Down Expand Up @@ -63,10 +62,6 @@

INIT_CFG_FILE = '/etc/sonic/init_cfg.json'

SYSTEMCTL_ACTION_STOP="stop"
SYSTEMCTL_ACTION_RESTART="restart"
SYSTEMCTL_ACTION_RESET_FAILED="reset-failed"

DEFAULT_NAMESPACE = ''
CFG_LOOPBACK_PREFIX = "Loopback"
CFG_LOOPBACK_PREFIX_LEN = len(CFG_LOOPBACK_PREFIX)
Expand Down Expand Up @@ -224,54 +219,6 @@ def breakout_Ports(cm, delPorts=list(), portJson=dict(), force=False, \
# Helper functions
#

# Execute action per NPU instance for multi instance services.
def execute_systemctl_per_asic_instance(inst, event, service, action):
try:
click.echo("Executing {} of service {}@{}...".format(action, service, inst))
clicommon.run_command("systemctl {} {}@{}.service".format(action, service, inst))
except SystemExit as e:
log.log_error("Failed to execute {} of service {}@{} with error {}".format(action, service, inst, e))
# Set the event object if there is a failure and exception was raised.
event.set()

# Execute action on list of systemd services
def execute_systemctl(list_of_services, action):
num_asic = multi_asic.get_num_asics()
generated_services_list, generated_multi_instance_services = _get_sonic_generated_services(num_asic)
if ((generated_services_list == []) and
(generated_multi_instance_services == [])):
log.log_error("Failed to get generated services")
return

for service in list_of_services:
if (service + '.service' in generated_services_list):
try:
click.echo("Executing {} of service {}...".format(action, service))
clicommon.run_command("systemctl {} {}".format(action, service))
except SystemExit as e:
log.log_error("Failed to execute {} of service {} with error {}".format(action, service, e))
raise

if (service + '.service' in generated_multi_instance_services):
# With Multi NPU, Start a thread per instance to do the "action" on multi instance services.
if multi_asic.is_multi_asic():
threads = []
# Use this event object to co-ordinate if any threads raised exception
e = threading.Event()

kwargs = {'service': service, 'action': action}
for inst in range(num_asic):
t = threading.Thread(target=execute_systemctl_per_asic_instance, args=(inst, e), kwargs=kwargs)
threads.append(t)
t.start()

# Wait for all the threads to finish.
for inst in range(num_asic):
threads[inst].join()

# Check if any of the threads have raised exception, if so exit the process.
if e.is_set():
sys.exit(1)

def _get_device_type():
"""
Expand Down Expand Up @@ -720,97 +667,26 @@ def _get_disabled_services_list(config_db):

return disabled_services_list

def _stop_services(config_db):
# This list is order-dependent. Please add services in the order they should be stopped
# on Mellanox platform pmon is stopped by syncd
services_to_stop = [
'telemetry',
'restapi',
'swss',
'lldp',
'pmon',
'bgp',
'hostcfgd',
'nat'
]

if asic_type == 'mellanox' and 'pmon' in services_to_stop:
services_to_stop.remove('pmon')

disabled_services = _get_disabled_services_list(config_db)

for service in disabled_services:
if service in services_to_stop:
services_to_stop.remove(service)

execute_systemctl(services_to_stop, SYSTEMCTL_ACTION_STOP)


def _reset_failed_services(config_db):
# This list is order-independent. Please keep list in alphabetical order
services_to_reset = [
'bgp',
'dhcp_relay',
'hostcfgd',
'hostname-config',
'interfaces-config',
'lldp',
'mux',
'nat',
'ntp-config',
'pmon',
'radv',
'restapi',
'rsyslog-config',
'sflow',
'snmp',
'swss',
'syncd',
'teamd',
'telemetry',
'macsec',
]

disabled_services = _get_disabled_services_list(config_db)

for service in disabled_services:
if service in services_to_reset:
services_to_reset.remove(service)

execute_systemctl(services_to_reset, SYSTEMCTL_ACTION_RESET_FAILED)


def _restart_services(config_db):
# This list is order-dependent. Please add services in the order they should be started
# on Mellanox platform pmon is started by syncd
services_to_restart = [
'hostname-config',
'interfaces-config',
'ntp-config',
'rsyslog-config',
'swss',
'mux',
'bgp',
'pmon',
'lldp',
'hostcfgd',
'nat',
'sflow',
'restapi',
'telemetry',
'macsec',
]

disabled_services = _get_disabled_services_list(config_db)

for service in disabled_services:
if service in services_to_restart:
services_to_restart.remove(service)

if asic_type == 'mellanox' and 'pmon' in services_to_restart:
services_to_restart.remove('pmon')

execute_systemctl(services_to_restart, SYSTEMCTL_ACTION_RESTART)

def _stop_services():
click.echo("Stopping SONiC target ...")
clicommon.run_command("sudo systemctl stop sonic.target")


def _get_sonic_services():
out = clicommon.run_command("systemctl list-dependencies --plain sonic.target | sed '1d'", return_cmd=True)
return [unit.strip() for unit in out.splitlines()]


def _reset_failed_services():
for service in _get_sonic_services():
click.echo("Resetting failed status on {}".format(service))
clicommon.run_command("systemctl reset-failed {}".format(service))


def _restart_services():
click.echo("Restarting SONiC target ...")
clicommon.run_command("sudo systemctl restart sonic.target")

# Reload Monit configuration to pick up new hostname in case it changed
click.echo("Reloading Monit configuration ...")
Expand Down Expand Up @@ -1115,7 +991,7 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart):
#Stop services before config push
if not no_service_restart:
log.log_info("'reload' stopping services...")
_stop_services(db.cfgdb)
_stop_services()

# In Single ASIC platforms we have single DB service. In multi-ASIC platforms we have a global DB
# service running in the host + DB services running in each ASIC namespace created per ASIC.
Expand Down Expand Up @@ -1186,9 +1062,9 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart):
# We first run "systemctl reset-failed" to remove the "failed"
# status from all services before we attempt to restart them
if not no_service_restart:
_reset_failed_services(db.cfgdb)
_reset_failed_services()
log.log_info("'reload' restarting services...")
_restart_services(db.cfgdb)
_restart_services()

@config.command("load_mgmt_config")
@click.option('-y', '--yes', is_flag=True, callback=_abort_if_false,
Expand Down Expand Up @@ -1227,7 +1103,7 @@ def load_minigraph(db, no_service_restart):
#Stop services before config push
if not no_service_restart:
log.log_info("'load_minigraph' stopping services...")
_stop_services(db.cfgdb)
_stop_services()

# For Single Asic platform the namespace list has the empty string
# for mulit Asic platform the empty string to generate the config
Expand Down Expand Up @@ -1283,10 +1159,10 @@ def load_minigraph(db, no_service_restart):
# We first run "systemctl reset-failed" to remove the "failed"
# status from all services before we attempt to restart them
if not no_service_restart:
_reset_failed_services(db.cfgdb)
_reset_failed_services()
#FIXME: After config DB daemon is implemented, we'll no longer need to restart every service.
log.log_info("'load_minigraph' restarting services...")
_restart_services(db.cfgdb)
_restart_services()
click.echo("Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.")


Expand Down
60 changes: 7 additions & 53 deletions tests/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,53 +12,24 @@
from utilities_common.db import Db

load_minigraph_command_output="""\
Executing stop of service telemetry...
Executing stop of service swss...
Executing stop of service lldp...
Executing stop of service pmon...
Executing stop of service bgp...
Executing stop of service hostcfgd...
Executing stop of service nat...
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
Running command: pfcwd start_default
Running command: config qos reload --no-dynamic-buffer
Executing reset-failed of service bgp...
Executing reset-failed of service dhcp_relay...
Executing reset-failed of service hostcfgd...
Executing reset-failed of service hostname-config...
Executing reset-failed of service interfaces-config...
Executing reset-failed of service lldp...
Executing reset-failed of service nat...
Executing reset-failed of service ntp-config...
Executing reset-failed of service pmon...
Executing reset-failed of service radv...
Executing reset-failed of service rsyslog-config...
Executing reset-failed of service snmp...
Executing reset-failed of service swss...
Executing reset-failed of service syncd...
Executing reset-failed of service teamd...
Executing reset-failed of service telemetry...
Executing restart of service hostname-config...
Executing restart of service interfaces-config...
Executing restart of service ntp-config...
Executing restart of service rsyslog-config...
Executing restart of service swss...
Executing restart of service bgp...
Executing restart of service pmon...
Executing restart of service lldp...
Executing restart of service hostcfgd...
Executing restart of service nat...
Executing restart of service telemetry...
Restarting SONiC target ...
Reloading Monit configuration ...
Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.
"""

def mock_run_command_side_effect(*args, **kwargs):
command = args[0]

if 'display_cmd' in kwargs and kwargs['display_cmd'] == True:
if kwargs.get('display_cmd'):
click.echo(click.style("Running command: ", fg='cyan') + click.style(command, fg='green'))

if kwargs.get('return_cmd'):
return ''


class TestLoadMinigraph(object):
@classmethod
Expand All @@ -78,24 +49,7 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic):
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output
assert mock_run_command.call_count == 38

def test_load_minigraph_with_disabled_telemetry(self, get_cmd_module, setup_single_broadcom_asic):
with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
(config, show) = get_cmd_module
db = Db()
runner = CliRunner()
result = runner.invoke(config.config.commands["feature"].commands["state"], ["telemetry", "disabled"], obj=db)
assert result.exit_code == 0
result = runner.invoke(show.cli.commands["feature"].commands["status"], ["telemetry"], obj=db)
print(result.output)
assert result.exit_code == 0
result = runner.invoke(config.config.commands["load_minigraph"], ["-y"], obj=db)
print(result.exit_code)
print(result.output)
assert result.exit_code == 0
assert "telemetry" not in result.output
assert mock_run_command.call_count == 35
assert mock_run_command.call_count == 7

@classmethod
def teardown_class(cls):
Expand Down

0 comments on commit 8b3bc18

Please sign in to comment.