Skip to content

Commit

Permalink
[reload] Improve reload by using sonic.target. (sonic-net#1199)
Browse files Browse the repository at this point in the history
- What I did
Removed the hardcoded, order-dependent lists of services to stop/restart/reset-failed.

- How I did it
Used sonic.target to stop/restart/reset-failed.

- How to verify it
Execute `config reload` and observe that the services restart.

Signed-off-by: Stepan Blyshchak <stepanb@nvidia.com>
  • Loading branch information
stepanblyschak authored and anand-kumar-subramanian committed Mar 2, 2021
1 parent b678e49 commit 4c74ef4
Showing 1 changed file with 21 additions and 158 deletions.
179 changes: 21 additions & 158 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,6 @@

# JSON file under /etc/sonic; presumably the initial/default configuration — confirm against its users.
INIT_CFG_FILE = '/etc/sonic/init_cfg.json'

# Action verbs handed to execute_systemctl(), which formats them into
# "systemctl <action> <service>".
SYSTEMCTL_ACTION_STOP="stop"
SYSTEMCTL_ACTION_RESTART="restart"
SYSTEMCTL_ACTION_RESET_FAILED="reset-failed"

# NOTE(review): the empty string appears to denote the default (single-ASIC/host) namespace — confirm.
DEFAULT_NAMESPACE = ''
# Name prefix for loopback entries (presumably interface names — verify), and its cached length.
CFG_LOOPBACK_PREFIX = "Loopback"
CFG_LOOPBACK_PREFIX_LEN = len(CFG_LOOPBACK_PREFIX)
Expand Down Expand Up @@ -227,54 +223,6 @@ def breakout_Ports(cm, delPorts=list(), portJson=dict(), force=False, \
# Helper functions
#

# Run one systemctl action against a single instance of a multi instance service.
def execute_systemctl_per_asic_instance(inst, event, service, action):
    """Apply ``action`` to the ``<service>@<inst>.service`` systemd unit.

    Used as a thread target by execute_systemctl(). A failure is not
    propagated as an exception; instead it is logged and reported through
    ``event`` so the spawning thread can check it after joining.

    Args:
        inst: Instance index, formatted into the unit name after ``@``.
        event: Object with a ``set()`` method (a ``threading.Event`` in
            execute_systemctl()) that is set when the command fails.
        service: Base service name, without the ``@<inst>.service`` suffix.
        action: systemctl verb to execute.
    """
    progress_msg = "Executing {} of service {}@{}...".format(action, service, inst)
    try:
        click.echo(progress_msg)
        clicommon.run_command("systemctl {} {}@{}.service".format(action, service, inst))
    except SystemExit as exc:
        log.log_error("Failed to execute {} of service {}@{} with error {}".format(action, service, inst, exc))
        # Flag the failure for the caller; the SystemExit itself stops here.
        event.set()

# Apply a systemctl action to every service in a list
def execute_systemctl(list_of_services, action):
    """Run ``systemctl <action>`` for each service in ``list_of_services``.

    The generated single-instance and multi-instance unit lists are fetched
    from _get_sonic_generated_services(); if both come back empty an error is
    logged and nothing is executed.

    For each service, ``<service>.service`` is looked up in both lists:

    * single-instance match: the command is run directly; on ``SystemExit``
      the error is logged and the exception is re-raised.
    * multi-instance match on a multi-ASIC system: one thread per instance
      index in ``range(num_asics)`` runs
      execute_systemctl_per_asic_instance(); after all threads are joined the
      process exits with status 1 if any of them flagged a failure.

    Args:
        list_of_services: Iterable of service names without the ``.service``
            suffix, processed in iteration order.
        action: systemctl verb to execute.
    """
    asic_count = multi_asic.get_num_asics()
    single_units, multi_units = _get_sonic_generated_services(asic_count)
    if single_units == [] and multi_units == []:
        log.log_error("Failed to get generated services")
        return

    for service in list_of_services:
        unit_name = service + '.service'

        if unit_name in single_units:
            try:
                click.echo("Executing {} of service {}...".format(action, service))
                clicommon.run_command("systemctl {} {}".format(action, service))
            except SystemExit as exc:
                log.log_error("Failed to execute {} of service {} with error {}".format(action, service, exc))
                raise

        # Per-instance handling only applies to multi instance services on Multi NPU systems.
        if unit_name not in multi_units or not multi_asic.is_multi_asic():
            continue

        # Shared flag the worker threads set when their command fails.
        failure = threading.Event()
        thread_kwargs = {'service': service, 'action': action}

        workers = []
        for inst in range(asic_count):
            worker = threading.Thread(target=execute_systemctl_per_asic_instance,
                                      args=(inst, failure),
                                      kwargs=thread_kwargs)
            workers.append(worker)
            worker.start()

        # Wait for every instance before deciding whether to bail out.
        for worker in workers:
            worker.join()

        if failure.is_set():
            sys.exit(1)

def _get_device_type():
"""
Expand Down Expand Up @@ -723,110 +671,25 @@ def _get_disabled_services_list(config_db):

return disabled_services_list

def _stop_services(config_db):
    """Stop the fixed list of services, skipping the ones that do not apply.

    'pmon' is left out when ``asic_type`` is 'mellanox', and every service
    reported by _get_disabled_services_list(config_db) is left out as well.
    The remaining names are passed, in order, to execute_systemctl() with the
    stop action.

    Args:
        config_db: Handle forwarded to _get_disabled_services_list().
    """
    # Order-dependent: services are listed in the order they should be stopped.
    stop_order = [
        'telemetry',
        'restapi',
        'swss',
        'lldp',
        'pmon',
        'bgp',
        'hostcfgd',
        'nat'
    ]

    # On Mellanox platform pmon is stopped by syncd, so it is not stopped here.
    if asic_type == 'mellanox' and 'pmon' in stop_order:
        stop_order.remove('pmon')

    disabled = _get_disabled_services_list(config_db)
    stop_order = [name for name in stop_order if name not in disabled]

    execute_systemctl(stop_order, SYSTEMCTL_ACTION_STOP)


def _reset_failed_services(config_db):
    """Run the reset-failed action on the fixed service list, minus disabled services.

    Services reported by _get_disabled_services_list(config_db) are dropped
    from the list; the rest are passed to execute_systemctl() with the
    reset-failed action.

    Args:
        config_db: Handle forwarded to _get_disabled_services_list().
    """
    # Order-independent list.
    candidates = [
        'bgp',
        'dhcp_relay',
        'hostcfgd',
        'hostname-config',
        'interfaces-config',
        'lldp',
        'mux',
        'nat',
        'ntp-config',
        'pmon',
        'radv',
        'restapi',
        'rsyslog-config',
        'sflow',
        'snmp',
        'swss',
        'syncd',
        'teamd',
        'telemetry',
        'macsec',
    ]

    disabled = _get_disabled_services_list(config_db)
    to_reset = [name for name in candidates if name not in disabled]

    execute_systemctl(to_reset, SYSTEMCTL_ACTION_RESET_FAILED)


def _restart_services(config_db):
# This list is order-dependent. Please add services in the order they should be started
# on Mellanox platform pmon is started by syncd
services_to_restart = [
'hostname-config',
'interfaces-config',
'ntp-config',
'rsyslog-config',
'swss',
'mux',
'bgp',
'pmon',
'lldp',
'hostcfgd',
'nat',
'sflow',
'restapi',
'telemetry',
'macsec',
]

result = 0

disabled_services = _get_disabled_services_list(config_db)

for service in disabled_services:
if service in services_to_restart:
services_to_restart.remove(service)

if asic_type == 'mellanox' and 'pmon' in services_to_restart:
services_to_restart.remove('pmon')
def _stop_services():
    """Announce the stop on the console, then run ``systemctl stop`` on sonic.target."""
    stop_cmd = "sudo systemctl stop sonic.target"
    click.echo("Stopping SONiC target ...")
    clicommon.run_command(stop_cmd)

try:
execute_systemctl(services_to_restart, SYSTEMCTL_ACTION_RESTART)

# Reload Monit configuration to pick up new hostname in case it changed
click.echo("Reloading Monit configuration ...")
clicommon.run_command("sudo monit reload")
except Exception as e:
log.log_error("'_restart_services' start services failed, error: {}".format(e))
result = 1
def _get_sonic_services():
    """Return the unit names systemd lists as dependencies of sonic.target.

    Runs ``systemctl list-dependencies --plain sonic.target`` with the first
    output line removed by ``sed '1d'``, and strips surrounding whitespace
    from every remaining line.

    Returns:
        list of str: one non-empty unit name per output line.
    """
    out = clicommon.run_command("systemctl list-dependencies --plain sonic.target | sed '1d'", return_cmd=True)
    stripped = (line.strip() for line in out.splitlines())
    # Drop blank lines: _reset_failed_services() formats each entry into
    # "systemctl reset-failed {}", and an empty name would turn that into a
    # bare "systemctl reset-failed", which is not limited to SONiC units.
    return [unit for unit in stripped if unit]


def _reset_failed_services():
    """Run ``systemctl reset-failed`` on each unit returned by _get_sonic_services().

    Each unit is announced on the console before its command is run; units are
    handled one at a time in the order they are returned.
    """
    units = _get_sonic_services()
    for unit in units:
        click.echo("Resetting failed status on {}".format(unit))
        clicommon.run_command("systemctl reset-failed {}".format(unit))

return result

def _restart_services():
    """Announce the restart on the console, then run ``systemctl restart`` on sonic.target.

    Returns:
        int: 0 once the restart command has been issued. The ``reload``
        command folds this value into its integer status with ``rv |= rc``;
        returning ``None`` there raises ``TypeError``, so an explicit integer
        is required. Error handling of the command itself is left to
        clicommon.run_command().
    """
    click.echo("Restarting SONiC target ...")
    clicommon.run_command("sudo systemctl restart sonic.target")
    return 0

def interface_is_in_vlan(vlan_member_table, interface_name):
""" Check if an interface is in a vlan """
Expand Down Expand Up @@ -1181,7 +1044,7 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart, background):
# Stop services before config push
if not no_service_restart:
log.log_info("'reload' stopping services...")
_stop_services(db.cfgdb)
_stop_services()
except Exception as e:
log.log_error("'reload' failed at stop services, error: {}".format(e))

Expand Down Expand Up @@ -1256,9 +1119,9 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart, background):
# We first run "systemctl reset-failed" to remove the "failed"
# status from all services before we attempt to restart them
if not no_service_restart:
_reset_failed_services(db.cfgdb)
_reset_failed_services()
log.log_info("'reload' restarting services...")
rc = _restart_services(db.cfgdb)
rc = _restart_services()
rv |= rc
if rv == 0:
log.log_info("'reload' complete!")
Expand Down Expand Up @@ -1315,7 +1178,7 @@ def load_minigraph(db, no_service_restart):
#Stop services before config push
if not no_service_restart:
log.log_info("'load_minigraph' stopping services...")
_stop_services(db.cfgdb)
_stop_services()

# For Single Asic platform the namespace list has the empty string
# for multi Asic platform the empty string to generate the config
Expand Down Expand Up @@ -1371,10 +1234,10 @@ def load_minigraph(db, no_service_restart):
# We first run "systemctl reset-failed" to remove the "failed"
# status from all services before we attempt to restart them
if not no_service_restart:
_reset_failed_services(db.cfgdb)
_reset_failed_services()
#FIXME: After config DB daemon is implemented, we'll no longer need to restart every service.
log.log_info("'load_minigraph' restarting services...")
_restart_services(db.cfgdb)
_restart_services()
click.echo("Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.")


Expand Down

0 comments on commit 4c74ef4

Please sign in to comment.