Skip to content

Commit 9ac12bf

Browse files
authored
Fix platform daemon chassisd to handle auto restart on fail (sonic-net#247)
Description: Add signal handlers correctly and fix cleanup so that auto restart works correctly on signals. Motivation and Context: Add signal handlers correctly and fix cleanup so that auto restart works correctly on signals. How Has This Been Tested? Verified by running against a chassis line card (LC).
1 parent 24fba04 commit 9ac12bf

File tree

2 files changed

+76
-12
lines changed

2 files changed

+76
-12
lines changed

sonic-chassisd/scripts/chassisd

+21-12
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ except ImportError as e:
3030
# Constants ====================================================================
3131
#
3232

33+
SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n)
34+
for n in dir(signal) if n.startswith('SIG') and '_' not in n)
35+
3336
SYSLOG_IDENTIFIER = "chassisd"
3437

3538
CHASSIS_CFG_TABLE = 'CHASSIS_MODULE'
@@ -75,6 +78,10 @@ INVALID_IP = '0.0.0.0'
7578
MODULE_ADMIN_DOWN = 0
7679
MODULE_ADMIN_UP = 1
7780

81+
# This daemon should return non-zero exit code so that supervisord will
82+
# restart it automatically.
83+
exit_code = 0
84+
7885
#
7986
# Helper functions =============================================================
8087
#
@@ -366,18 +373,21 @@ class ChassisdDaemon(daemon_base.DaemonBase):
366373

367374
self.stop = threading.Event()
368375

369-
# Signal handler
376+
# Override signal handler from DaemonBase
370377
def signal_handler(self, sig, frame):
371-
if sig == signal.SIGHUP:
372-
self.log_info("Caught SIGHUP - ignoring...")
373-
elif sig == signal.SIGINT:
374-
self.log_info("Caught SIGINT - exiting...")
375-
self.stop.set()
376-
elif sig == signal.SIGTERM:
377-
self.log_info("Caught SIGTERM - exiting...")
378+
FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM]
379+
NONFATAL_SIGNALS = [signal.SIGHUP]
380+
381+
global exit_code
382+
383+
if sig in FATAL_SIGNALS:
384+
exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us
385+
self.log_info("Caught {} signal '{}' - exiting...".format(exit_code,SIGNALS_TO_NAMES_DICT[sig]))
378386
self.stop.set()
387+
elif sig in NONFATAL_SIGNALS:
388+
self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))
379389
else:
380-
self.log_warning("Caught unhandled signal '" + sig + "'")
390+
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))
381391

382392
# Run daemon
383393
def run(self):
@@ -421,9 +431,6 @@ class ChassisdDaemon(daemon_base.DaemonBase):
421431

422432
self.log_info("Stop daemon main loop")
423433

424-
if config_manager is not None:
425-
config_manager.task_stop()
426-
427434
# Delete all the information from DB and then exit
428435
self.module_updater.deinit()
429436

@@ -435,9 +442,11 @@ class ChassisdDaemon(daemon_base.DaemonBase):
435442

436443

437444
def main():
445+
global exit_code
438446
chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER)
439447
chassisd.run()
440448

449+
sys.exit(exit_code)
441450

442451
if __name__ == '__main__':
443452
main()

sonic-chassisd/tests/test_chassisd.py

+55
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,58 @@ def verify_fabric_asic(asic_name, asic_pci_address, module_name, asic_id_in_modu
441441
assert fvs == None
442442
fvs = fabric_asic_table.get("asic5")
443443
assert fvs == None
444+
445+
def test_signal_handler():
446+
exit_code = 0
447+
daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER)
448+
daemon_chassisd.stop.set = MagicMock()
449+
daemon_chassisd.log_info = MagicMock()
450+
daemon_chassisd.log_warning = MagicMock()
451+
452+
# Test SIGHUP
453+
daemon_chassisd.signal_handler(signal.SIGHUP, None)
454+
assert daemon_chassisd.log_info.call_count == 1
455+
daemon_chassisd.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...")
456+
assert daemon_chassisd.log_warning.call_count == 0
457+
assert daemon_chassisd.stop.set.call_count == 0
458+
assert exit_code == 0
459+
460+
# Reset
461+
daemon_chassisd.log_info.reset_mock()
462+
daemon_chassisd.log_warning.reset_mock()
463+
daemon_chassisd.stop.set.reset_mock()
464+
465+
# Test SIGINT
466+
test_signal = signal.SIGINT
467+
daemon_chassisd.signal_handler(test_signal, None)
468+
assert daemon_chassisd.log_info.call_count == 1
469+
daemon_chassisd.log_info.assert_called_with("Caught {} signal 'SIGINT' - exiting...".format(128 + test_signal))
470+
assert daemon_chassisd.log_warning.call_count == 0
471+
assert daemon_chassisd.stop.set.call_count == 1
472+
473+
# Reset
474+
daemon_chassisd.log_info.reset_mock()
475+
daemon_chassisd.log_warning.reset_mock()
476+
daemon_chassisd.stop.set.reset_mock()
477+
478+
# Test SIGTERM
479+
test_signal = signal.SIGTERM
480+
daemon_chassisd.signal_handler(test_signal, None)
481+
assert daemon_chassisd.log_info.call_count == 1
482+
daemon_chassisd.log_info.assert_called_with("Caught {} signal 'SIGTERM' - exiting...".format(128 + test_signal))
483+
assert daemon_chassisd.log_warning.call_count == 0
484+
assert daemon_chassisd.stop.set.call_count == 1
485+
486+
# Reset
487+
daemon_chassisd.log_info.reset_mock()
488+
daemon_chassisd.log_warning.reset_mock()
489+
daemon_chassisd.stop.set.reset_mock()
490+
exit_code = 0
491+
492+
# Test an unhandled signal
493+
daemon_chassisd.signal_handler(signal.SIGUSR1, None)
494+
assert daemon_chassisd.log_warning.call_count == 1
495+
daemon_chassisd.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...")
496+
assert daemon_chassisd.log_info.call_count == 0
497+
assert daemon_chassisd.stop.set.call_count == 0
498+
assert exit_code == 0

0 commit comments

Comments
 (0)