Skip to content

Commit

Permalink
[chassis][linecard] Fix Module LINECARD<> went off-line message for e…
Browse files Browse the repository at this point in the history
…mpty slot issue (#462)

* [chassis][linecard] Fix Module LINECARD<> went off-line message for empty slot issue

Signed-off-by: mlok <marty.lok@nokia.com>

* Define/use get_module_current_status()

---------

Signed-off-by: mlok <marty.lok@nokia.com>
Co-authored-by: Arvindsrinivasan Lakshmi Narasimhan <55814491+arlakshm@users.noreply.github.com>
  • Loading branch information
mlok-nokia and arlakshm authored Apr 17, 2024
1 parent 25e22cd commit 0f61e15
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 15 deletions.
34 changes: 23 additions & 11 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,13 @@ class ModuleUpdater(logger.Logger):
fvs = swsscommon.FieldValuePairs([(CHASSIS_INFO_CARD_NUM_FIELD, str(num_modules))])
self.chassis_table.set(CHASSIS_INFO_KEY_TEMPLATE.format(1), fvs)

def get_module_current_status(self, key):
    """Return the operational status currently stored for a module.

    Args:
        key: Key of the module entry to look up in ``self.module_table``.

    Returns:
        The value stored under CHASSIS_MODULE_INFO_OPERSTATUS_FIELD for
        ``key``, or ModuleBase.MODULE_STATUS_EMPTY when the table has no
        usable entry (lookup miss, malformed result, or missing field).
    """
    entry = self.module_table.get(key)

    # A successful lookup is a sequence whose first item is True and whose
    # last item holds the field/value pairs.
    # NOTE(review): tuple is accepted alongside list in case the table
    # binding returns one - confirm against the swsscommon version in use.
    is_hit = (
        isinstance(entry, (list, tuple))
        and len(entry) > 1
        and entry[0] is True
    )
    if not is_hit:
        return ModuleBase.MODULE_STATUS_EMPTY

    # Fall back to EMPTY rather than raising KeyError when the entry exists
    # but carries no oper-status field.
    return dict(entry[-1]).get(CHASSIS_MODULE_INFO_OPERSTATUS_FIELD,
                               ModuleBase.MODULE_STATUS_EMPTY)

def module_db_update(self):
notOnlineModules = []

Expand All @@ -260,6 +267,7 @@ class ModuleUpdater(logger.Logger):
(CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]),
(CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS]))),
(CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])])
prev_status = self.get_module_current_status(key)
self.module_table.set(key, fvs)

# Construct key for down_modules dict. Example down_modules key format: LINE-CARD0|<hostname>
Expand All @@ -272,23 +280,27 @@ class ModuleUpdater(logger.Logger):
down_module_key = key+'|'

if module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] != str(ModuleBase.MODULE_STATUS_ONLINE):
notOnlineModules.append(key)
# Record the time when the module down was detected to track the
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
# long time like 30 mins.
# All down modules including supervisor are added to the down modules dictionary. This is to help
# identifying module operational status change. But the clean up will not be attempted for supervisor
if down_module_key not in self.down_modules:
self.log_warning("Module {} went off-line!".format(key))
self.down_modules[down_module_key] = {}
self.down_modules[down_module_key]['down_time'] = time.time()
self.down_modules[down_module_key]['cleaned'] = False
if prev_status == ModuleBase.MODULE_STATUS_ONLINE:
notOnlineModules.append(key)
# Record the time when the module down was detected to track the
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
# long time like 30 mins.
# All down modules including supervisor are added to the down modules dictionary. This is to help
# identifying module operational status change. But the clean up will not be attempted for supervisor

if down_module_key not in self.down_modules:
self.log_warning("Module {} went off-line!".format(key))
self.down_modules[down_module_key] = {}
self.down_modules[down_module_key]['down_time'] = time.time()
self.down_modules[down_module_key]['cleaned'] = False
continue
else:
# Module is operational. Remove it from down time tracking.
if down_module_key in self.down_modules:
self.log_notice("Module {} recovered on-line!".format(key))
del self.down_modules[down_module_key]
elif prev_status != ModuleBase.MODULE_STATUS_ONLINE:
self.log_notice("Module {} is on-line!".format(key))

for asic_id, asic in enumerate(module_info_dict[CHASSIS_MODULE_INFO_ASICS]):
asic_global_id, asic_pci_addr = asic
Expand Down
83 changes: 79 additions & 4 deletions sonic-chassisd/tests/test_chassisd.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,8 +652,83 @@ def test_chassis_db_cleanup():

# Mock >= CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD module down period for LINE-CARD1
down_module_key = lc2_name+"|"
module_down_time = sup_module_updater.down_modules[down_module_key]["down_time"]
sup_module_updater.down_modules[down_module_key]["down_time"] = module_down_time - ((CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD+10)*60)
assert down_module_key not in sup_module_updater.down_modules.keys()

sup_module_updater.module_down_chassis_db_cleanup()

def test_chassis_db_bootup_with_empty_slot():
    """Check down-module tracking when the chassis boots with an empty slot.

    Builds a chassis with a supervisor, one ONLINE line card and one EMPTY
    slot, runs a module DB update, and asserts that neither line card is
    recorded in ``down_modules``. The ONLINE card is then switched to
    OFFLINE and, after a second update, must appear in ``down_modules``.
    """
    chassis = MockChassis()

    # Supervisor
    index = 0
    sup_name = "SUPERVISOR0"
    desc = "Supervisor card"
    sup_slot = 16
    serial = "RP1000101"
    module_type = ModuleBase.MODULE_TYPE_SUPERVISOR
    supervisor = MockModule(index, sup_name, desc, module_type, sup_slot, serial)
    supervisor.set_midplane_ip()
    chassis.module_list.append(supervisor)

    # Linecard 0: populated slot, reported ONLINE
    index = 1
    lc_name = "LINE-CARD0"
    desc = "36 port 400G card"
    lc_slot = 1
    serial = "LC1000101"
    module_type = ModuleBase.MODULE_TYPE_LINE
    module = MockModule(index, lc_name, desc, module_type, lc_slot, serial)
    module.set_midplane_ip()
    status = ModuleBase.MODULE_STATUS_ONLINE
    module.set_oper_status(status)
    chassis.module_list.append(module)

    # Linecard 1: empty slot, reported EMPTY
    index = 2
    lc2_name = u"LINE-CARD1"
    desc = "Unavailable'"
    lc2_slot = 2
    serial = "N/A"
    module_type = ModuleBase.MODULE_TYPE_LINE
    module2 = MockModule(index, lc2_name, desc, module_type, lc2_slot, serial)
    module2.set_midplane_ip()
    status = ModuleBase.MODULE_STATUS_EMPTY
    module2.set_oper_status(status)
    chassis.module_list.append(module2)

    # Supervisor ModuleUpdater
    sup_module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, sup_slot, sup_slot)
    sup_module_updater.modules_num_update()

    sup_module_updater.module_db_update()

    def stored_oper_status(name):
        # Read the oper-status field recorded for `name` in the module table.
        fvs = sup_module_updater.module_table.get(name)
        if isinstance(fvs, list):
            fvs = dict(fvs[-1])
        return fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]

    # Check LINE-CARD0 is ONLINE in the module table
    assert ModuleBase.MODULE_STATUS_ONLINE == stored_oper_status(lc_name)

    # Check LINE-CARD1 is EMPTY in the module table
    assert ModuleBase.MODULE_STATUS_EMPTY == stored_oper_status(lc2_name)

    # Both should not be in down_modules keys.
    down_module_lc1_key = lc_name+"|"
    assert down_module_lc1_key not in sup_module_updater.down_modules
    # Build the key from lc2_name so the empty slot itself is checked.
    down_module_lc2_key = lc2_name+"|"
    assert down_module_lc2_key not in sup_module_updater.down_modules

    # Change LINE-CARD0 status to OFFLINE
    status = ModuleBase.MODULE_STATUS_OFFLINE
    module.set_oper_status(status)
    sup_module_updater.module_db_update()

    assert status == stored_oper_status(lc_name)
    assert down_module_lc1_key in sup_module_updater.down_modules

0 comments on commit 0f61e15

Please sign in to comment.