Skip to content

Commit

Permalink
[xcvrd] Extend xcvrd with SFP error event handling (sonic-net#52)
Browse files Browse the repository at this point in the history
* extend xcvrd with SFP error event handling
* change sfp error table to status table, store plug in/out and error status
using enum for sfp error code
  • Loading branch information
keboliu authored Apr 16, 2020
1 parent 97e40ce commit 238fc06
Showing 1 changed file with 95 additions and 9 deletions.
104 changes: 95 additions & 9 deletions sonic-xcvrd/scripts/xcvrd
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ try:
from sonic_daemon_base import daemon_base
from sonic_daemon_base.daemon_base import Logger
from sonic_daemon_base.daemon_base import DaemonBase
from enum import Enum
except ImportError, e:
raise ImportError (str(e) + " - required module not found")

Expand All @@ -33,15 +34,25 @@ PLATFORM_SPECIFIC_CLASS_NAME = "SfpUtil"

TRANSCEIVER_INFO_TABLE = 'TRANSCEIVER_INFO'
TRANSCEIVER_DOM_SENSOR_TABLE = 'TRANSCEIVER_DOM_SENSOR'
TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS'

SELECT_TIMEOUT_MSECS = 1000

DOM_INFO_UPDATE_PERIOD_SECS = 60
TIME_FOR_SFP_READY_SECS = 1
XCVRD_MAIN_THREAD_SLEEP_SECS = 60

SFP_STATUS_INSERTED = '1'
# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
SFP_STATUS_REMOVED = '0'
SFP_STATUS_INSERTED = '1'

# Error codes an SFP can report. Values start at 2, following '0'
# (SFP_STATUS_REMOVED) and '1' (SFP_STATUS_INSERTED). Append new member names
# here when additional errors need to be supported.
SFP_STATUS_ERR_ENUM = Enum(
    'SFP_STATUS_ERR_ENUM',
    (
        'SFP_STATUS_ERR_I2C_STUCK',
        'SFP_STATUS_ERR_BAD_EEPROM',
        'SFP_STATUS_ERR_UNSUPPORTED_CABLE',
        'SFP_STATUS_ERR_HIGH_TEMP',
        'SFP_STATUS_ERR_BAD_CABLE',
    ),
    start=2
)

# String form of every error code, kept in a set for quick membership tests
errors_block_eeprom_reading = {str(member.value) for member in SFP_STATUS_ERR_ENUM}

EVENT_ON_ALL_SFP = '-1'
# events definition
Expand Down Expand Up @@ -411,23 +422,25 @@ def del_port_sfp_dom_info_from_db(logical_port_name, int_tbl, dom_tbl):
ganged_member_num += 1

try:
int_tbl._del(port_name)
dom_tbl._del(port_name)
if int_tbl != None:
int_tbl._del(port_name)
if dom_tbl != None:
dom_tbl._del(port_name)

except NotImplementedError:
logger.log_error("This functionality is currently not implemented for this platform")
sys.exit(NOT_IMPLEMENTED_ERROR)

# recover missing sfp table entries if any
def recover_missing_sfp_table_entries(sfp_util, int_tbl, stop_event):
def recover_missing_sfp_table_entries(sfp_util, int_tbl, status_tbl, stop_event):
transceiver_dict = {}

keys = int_tbl.getKeys()
logical_port_list = sfp_util.logical
for logical_port_name in logical_port_list:
if stop_event.is_set():
break
if logical_port_name not in keys:
if logical_port_name not in keys and not detect_port_in_error_status(logical_port_name, status_tbl):
post_port_sfp_info_to_db(logical_port_name, int_tbl, transceiver_dict, stop_event)


Expand Down Expand Up @@ -641,6 +654,53 @@ def waiting_time_compensation_with_sleep(time_start, time_to_wait):
if time_diff < time_to_wait:
time.sleep(time_to_wait - time_diff)

# Record the latest SFP status of a port in the SFP status table
def update_port_transceiver_status_table(logical_port_name, status_tbl, status):
    """Store `status` under the 'status' field of the port's table entry.

    Args:
        logical_port_name: key of the entry to write in status_tbl.
        status_tbl: table object providing set(key, field_value_pairs).
        status: value written to the 'status' field.
    """
    status_field = ('status', status)
    status_tbl.set(logical_port_name, swsscommon.FieldValuePairs([status_field]))

# Remove a port's entry from the SFP status table
def delete_port_from_status_table(logical_port_name, status_tbl):
    """Delete the entry keyed by `logical_port_name` from `status_tbl`."""
    status_tbl._del(logical_port_name)

# Check whether a port is currently recorded as being in an SFP error status
def detect_port_in_error_status(logical_port_name, status_tbl):
    """Return True if the port's stored status is one of the SFP error codes.

    Args:
        logical_port_name: key of the port's entry in status_tbl.
        status_tbl: table object whose get(key) returns a
            (found, field_value_pairs) pair.

    Returns:
        True when an entry exists and its 'status' field is a member of
        errors_block_eeprom_reading; False when there is no entry, the entry
        has no 'status' field, or the status is not an error code.
    """
    found, fvp = status_tbl.get(logical_port_name)
    if not found:
        return False

    # Use .get() so an entry lacking the 'status' field is treated as
    # "not in error" instead of raising KeyError.
    status = dict(fvp).get('status')
    if status is None:
        return False

    return status in errors_block_eeprom_reading

# Init TRANSCEIVER_STATUS table
def init_port_sfp_status_tbl(stop_event=None):
    """Populate the TRANSCEIVER_STATUS table with each port's presence status.

    For every logical port, each of its physical ports is probed with
    _wrapper_get_presence() and the logical port's status is written as
    SFP_STATUS_INSERTED or SFP_STATUS_REMOVED. A logical port with no physical
    port mapping is logged as an error and recorded as removed.

    Args:
        stop_event: optional event object; when it is set, the
            initialization loops stop early. A fresh, unset threading.Event
            is created when omitted.
    """
    # Create the default per call; an Event() in the signature would be built
    # once at definition time and shared by every call that omits the argument.
    if stop_event is None:
        stop_event = threading.Event()

    # Connect to STATE_DB and create transceiver status table
    state_db = daemon_base.db_connect(swsscommon.STATE_DB)
    status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)

    for logical_port_name in platform_sfputil.logical:
        if stop_event.is_set():
            break

        physical_port_list = logical_port_name_to_physical_port_list(logical_port_name)
        if physical_port_list is None:
            logger.log_error("No physical ports found for logical port '%s'" % logical_port_name)
            update_port_transceiver_status_table(logical_port_name, status_tbl, SFP_STATUS_REMOVED)
            # Nothing to probe for this port; skipping also avoids iterating
            # over None below.
            continue

        # The entry is keyed by logical port, so when several physical ports
        # map to it the last one probed determines the stored status.
        for physical_port in physical_port_list:
            if stop_event.is_set():
                break

            if not _wrapper_get_presence(physical_port):
                update_port_transceiver_status_table(logical_port_name, status_tbl, SFP_STATUS_REMOVED)
            else:
                update_port_transceiver_status_table(logical_port_name, status_tbl, SFP_STATUS_INSERTED)


#
# Helper classes ===============================================================
#
Expand All @@ -657,13 +717,15 @@ class dom_info_update_task:
# Connect to STATE_DB and create transceiver dom info table
state_db = daemon_base.db_connect(swsscommon.STATE_DB)
dom_tbl = swsscommon.Table(state_db, TRANSCEIVER_DOM_SENSOR_TABLE)
status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)

# Start loop to update dom info in DB periodically
while not self.task_stopping_event.wait(DOM_INFO_UPDATE_PERIOD_SECS):
logical_port_list = platform_sfputil.logical
for logical_port_name in logical_port_list:
post_port_dom_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
if not detect_port_in_error_status(logical_port_name, status_tbl):
post_port_dom_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)

logger.log_info("Stop DOM monitoring loop")

Expand Down Expand Up @@ -716,6 +778,7 @@ class sfp_state_update_task:
state_db = daemon_base.db_connect(swsscommon.STATE_DB)
int_tbl = swsscommon.Table(state_db, TRANSCEIVER_INFO_TABLE)
dom_tbl = swsscommon.Table(state_db, TRANSCEIVER_DOM_SENSOR_TABLE)
status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)

# Connect to APPL_DB to notify Media notifications
appl_db = daemon_base.db_connect(swsscommon.APPL_DB)
Expand Down Expand Up @@ -846,6 +909,9 @@ class sfp_state_update_task:
for logical_port in logical_port_list:
if value == SFP_STATUS_INSERTED:
logger.log_info("Got SFP inserted event")
# A plugin event will clear the error state.
update_port_transceiver_status_table(logical_port, status_tbl, SFP_STATUS_INSERTED)
logger.log_info("receive plug in and update port sfp status table.")
rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict)
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
if rc == SFP_EEPROM_NOT_READY:
Expand All @@ -858,9 +924,23 @@ class sfp_state_update_task:
transceiver_dict.clear()
elif value == SFP_STATUS_REMOVED:
logger.log_info("Got SFP removed event")
update_port_transceiver_status_table(logical_port, status_tbl, SFP_STATUS_REMOVED)
logger.log_info("receive plug out and pdate port sfp status table.")
del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl)
elif value in errors_block_eeprom_reading:
logger.log_info("Got SFP Error event")
# Add port to error table to stop accessing eeprom of it
# If the port already in the error table, the stored error code will
# be updated to the new one.
update_port_transceiver_status_table(logical_port, status_tbl, value)
logger.log_info("receive error update port sfp status table.")
# In this case the EEPROM is not accessible, so remove the DOM info,
# since it would become outdated without regular updates.
# Keep the interface info in the DB, though, since it is static.
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl)

else:
# TODO, SFP return error code, need handle accordingly.
# SFP returned an unknown event; just ignore it for now.
logger.log_warning("Got unknown event {}, ignored".format(value))
continue
else:
Expand Down Expand Up @@ -1012,6 +1092,7 @@ class DaemonXcvrd(DaemonBase):
state_db = daemon_base.db_connect(swsscommon.STATE_DB)
self.int_tbl = swsscommon.Table(state_db, TRANSCEIVER_INFO_TABLE)
self.dom_tbl = swsscommon.Table(state_db, TRANSCEIVER_DOM_SENSOR_TABLE)
self.status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)

self.load_media_settings()
warmstart = swsscommon.WarmStart()
Expand All @@ -1027,6 +1108,10 @@ class DaemonXcvrd(DaemonBase):
logger.log_info("Post all port DOM/SFP info to DB")
post_port_sfp_dom_info_to_db(is_warm_start, self.stop_event)

# Init port sfp status table
logger.log_info("Init port sfp status table")
init_port_sfp_status_tbl(self.stop_event)

# Deinitialize daemon
def deinit(self):
logger.log_info("Start daemon deinit...")
Expand All @@ -1035,6 +1120,7 @@ class DaemonXcvrd(DaemonBase):
logical_port_list = platform_sfputil.logical
for logical_port_name in logical_port_list:
del_port_sfp_dom_info_from_db(logical_port_name, self.int_tbl, self.dom_tbl)
delete_port_from_status_table(logical_port_name, self.status_tbl)

# Run daemon
def run(self):
Expand All @@ -1056,7 +1142,7 @@ class DaemonXcvrd(DaemonBase):

while not self.stop_event.wait(self.timeout):
# Check the integrity of the sfp info table and recover the missing entries if any
recover_missing_sfp_table_entries(platform_sfputil, self.int_tbl, self.stop_event)
recover_missing_sfp_table_entries(platform_sfputil, self.int_tbl, self.status_tbl, self.stop_event)

logger.log_info("Stop daemon main loop")

Expand Down

0 comments on commit 238fc06

Please sign in to comment.