Skip to content

Commit

Permalink
Fix platform daemon chassisd to handle auto restart on fail (#247)
Browse files Browse the repository at this point in the history
Description
Add signal handlers correctly and fix clean up so that the auto restart works ok on signals.

Motivation and Context
Add signal handlers correctly and fix clean up so that the auto restart works ok on signals.

How Has This Been Tested?
Verified by running again chassis LC.
  • Loading branch information
judyjoseph authored Mar 31, 2022
1 parent 24fba04 commit 9ac12bf
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 12 deletions.
33 changes: 21 additions & 12 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ except ImportError as e:
# Constants ====================================================================
#

SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n)
for n in dir(signal) if n.startswith('SIG') and '_' not in n)

SYSLOG_IDENTIFIER = "chassisd"

CHASSIS_CFG_TABLE = 'CHASSIS_MODULE'
Expand Down Expand Up @@ -75,6 +78,10 @@ INVALID_IP = '0.0.0.0'
MODULE_ADMIN_DOWN = 0
MODULE_ADMIN_UP = 1

# This daemon should return non-zero exit code so that supervisord will
# restart it automatically.
exit_code = 0

#
# Helper functions =============================================================
#
Expand Down Expand Up @@ -366,18 +373,21 @@ class ChassisdDaemon(daemon_base.DaemonBase):

self.stop = threading.Event()

# Signal handler
# Override signal handler from DaemonBase
def signal_handler(self, sig, frame):
if sig == signal.SIGHUP:
self.log_info("Caught SIGHUP - ignoring...")
elif sig == signal.SIGINT:
self.log_info("Caught SIGINT - exiting...")
self.stop.set()
elif sig == signal.SIGTERM:
self.log_info("Caught SIGTERM - exiting...")
FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM]
NONFATAL_SIGNALS = [signal.SIGHUP]

global exit_code

if sig in FATAL_SIGNALS:
exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us
self.log_info("Caught {} signal '{}' - exiting...".format(exit_code,SIGNALS_TO_NAMES_DICT[sig]))
self.stop.set()
elif sig in NONFATAL_SIGNALS:
self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))
else:
self.log_warning("Caught unhandled signal '" + sig + "'")
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))

# Run daemon
def run(self):
Expand Down Expand Up @@ -421,9 +431,6 @@ class ChassisdDaemon(daemon_base.DaemonBase):

self.log_info("Stop daemon main loop")

if config_manager is not None:
config_manager.task_stop()

# Delete all the information from DB and then exit
self.module_updater.deinit()

Expand All @@ -435,9 +442,11 @@ class ChassisdDaemon(daemon_base.DaemonBase):


def main():
global exit_code
chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER)
chassisd.run()

sys.exit(exit_code)

if __name__ == '__main__':
main()
55 changes: 55 additions & 0 deletions sonic-chassisd/tests/test_chassisd.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,58 @@ def verify_fabric_asic(asic_name, asic_pci_address, module_name, asic_id_in_modu
assert fvs == None
fvs = fabric_asic_table.get("asic5")
assert fvs == None

def test_signal_handler():
exit_code = 0
daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER)
daemon_chassisd.stop.set = MagicMock()
daemon_chassisd.log_info = MagicMock()
daemon_chassisd.log_warning = MagicMock()

# Test SIGHUP
daemon_chassisd.signal_handler(signal.SIGHUP, None)
assert daemon_chassisd.log_info.call_count == 1
daemon_chassisd.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...")
assert daemon_chassisd.log_warning.call_count == 0
assert daemon_chassisd.stop.set.call_count == 0
assert exit_code == 0

# Reset
daemon_chassisd.log_info.reset_mock()
daemon_chassisd.log_warning.reset_mock()
daemon_chassisd.stop.set.reset_mock()

# Test SIGINT
test_signal = signal.SIGINT
daemon_chassisd.signal_handler(test_signal, None)
assert daemon_chassisd.log_info.call_count == 1
daemon_chassisd.log_info.assert_called_with("Caught {} signal 'SIGINT' - exiting...".format(128 + test_signal))
assert daemon_chassisd.log_warning.call_count == 0
assert daemon_chassisd.stop.set.call_count == 1

# Reset
daemon_chassisd.log_info.reset_mock()
daemon_chassisd.log_warning.reset_mock()
daemon_chassisd.stop.set.reset_mock()

# Test SIGTERM
test_signal = signal.SIGTERM
daemon_chassisd.signal_handler(test_signal, None)
assert daemon_chassisd.log_info.call_count == 1
daemon_chassisd.log_info.assert_called_with("Caught {} signal 'SIGTERM' - exiting...".format(128 + test_signal))
assert daemon_chassisd.log_warning.call_count == 0
assert daemon_chassisd.stop.set.call_count == 1

# Reset
daemon_chassisd.log_info.reset_mock()
daemon_chassisd.log_warning.reset_mock()
daemon_chassisd.stop.set.reset_mock()
exit_code = 0

# Test an unhandled signal
daemon_chassisd.signal_handler(signal.SIGUSR1, None)
assert daemon_chassisd.log_warning.call_count == 1
daemon_chassisd.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...")
assert daemon_chassisd.log_info.call_count == 0
assert daemon_chassisd.stop.set.call_count == 0
assert exit_code == 0

0 comments on commit 9ac12bf

Please sign in to comment.