Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] Add bitmap support for SFP error event #7605

Merged
merged 8 commits into from
Jun 25, 2021
Merged
10 changes: 7 additions & 3 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,18 +571,22 @@ def get_change_event(self, timeout=0):

wait_for_ever = (timeout == 0)
port_dict = {}
error_dict = {}
if wait_for_ever:
timeout = MAX_SELECT_DELAY
while True:
status = self.sfp_event.check_sfp_status(port_dict, timeout)
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
if bool(port_dict):
break
else:
status = self.sfp_event.check_sfp_status(port_dict, timeout)
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)

if status:
self.reinit_sfps(port_dict)
return True, {'sfp':port_dict}
result_dict = {'sfp':port_dict}
if error_dict:
result_dict['sfp_error'] = error_dict
return True, result_dict
else:
return True, {'sfp':{}}

Expand Down
92 changes: 91 additions & 1 deletion platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

try:
import subprocess
import os
from sonic_platform_base.sfp_base import SfpBase
from sonic_platform_base.sonic_eeprom import eeprom_dts
from sonic_platform_base.sonic_sfp.sff8472 import sff8472InterfaceId
Expand All @@ -35,6 +36,18 @@
except ImportError as e:
pass

# The SDK constants cannot be imported while the platform API runs under
# unit test, so stand-in values are defined here when the
# PLATFORM_API_UNIT_TESTING environment variable is set to "1".
if os.environ.get("PLATFORM_API_UNIT_TESTING") == "1":
    SX_PORT_MODULE_STATUS_INITIALIZING = 0
    SX_PORT_MODULE_STATUS_PLUGGED = 1
    SX_PORT_MODULE_STATUS_UNPLUGGED = 2
    SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR = 3
    SX_PORT_MODULE_STATUS_PLUGGED_DISABLED = 4

# definitions of the offset and width for values in XCVR info eeprom
XCVR_INTFACE_BULK_OFFSET = 0
XCVR_INTFACE_BULK_WIDTH_QSFP = 20
Expand Down Expand Up @@ -330,6 +343,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
class SFP(SfpBase):
"""Platform-specific SFP class"""

SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE = 'Long range for non-Mellanox cable or module'
SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST = 'Enforce part number list'
SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED = 'PMD type not enabled'
SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED = 'PCIE system power slot exceeded'
SFP_MLNX_ERROR_DESCRIPTION_RESERVED = 'Reserved'

SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE = 0x00010000
SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST = 0x00020000
SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED = 0x00040000
SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED = 0x00080000
SFP_MLNX_ERROR_BIT_RESERVED = 0x80000000

def __init__(self, sfp_index, sfp_type, sdk_handle_getter, platform):
SfpBase.__init__(self)
self.index = sfp_index + 1
Expand Down Expand Up @@ -388,7 +413,7 @@ def get_presence(self):
# Read out any bytes from any offset
def _read_eeprom_specific_bytes(self, offset, num_bytes):
eeprom_raw = []
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {}".format(self.index, offset, num_bytes)
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {} 2>/dev/null".format(self.index, offset, num_bytes)
try:
output = subprocess.check_output(ethtool_cmd,
shell=True,
Expand Down Expand Up @@ -2158,3 +2183,68 @@ def is_replaceable(self):
bool: True if it is replaceable.
"""
return True

def _get_error_code(self):
    """
    Query the SDK for the current state of this SFP module

    Returns:
        A tuple (oper_state, error_type) read from the module_state of
        the entry filled in by sx_mgmt_phy_module_info_get
    """
    # NOTE(review): the two arrays allocated below are not released in this
    # method - confirm whether the SDK bindings need an explicit delete.
    request_arr = new_sx_mgmt_module_id_info_t_arr(1)
    response_arr = new_sx_mgmt_phy_module_info_t_arr(1)

    # Single-entry request: slot 0, this module's SDK index
    request = sx_mgmt_module_id_info_t()
    request.slot_id = 0
    request.module_id = self.sdk_index
    sx_mgmt_module_id_info_t_arr_setitem(request_arr, 0, request)

    rc = sx_mgmt_phy_module_info_get(self.sdk_handle, request_arr, 1, response_arr)
    # NOTE(review): assert statements are skipped when Python runs with -O
    assert SX_STATUS_SUCCESS == rc, "sx_mgmt_phy_module_info_get failed, error code {}".format(rc)

    response = sx_mgmt_phy_module_info_t_arr_getitem(response_arr, 0)
    return response.module_state.oper_state, response.module_state.error_type

@classmethod
def _get_error_description_dict(cls):
    """
    Build the table that maps an SDK error type to its description

    Returns:
        A dict keyed by the integer error type; each value is the
        matching description attribute looked up on cls
    """
    code_description_pairs = (
        (0, cls.SFP_ERROR_DESCRIPTION_POWER_BUDGET_EXCEEDED),
        (1, cls.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE),
        (2, cls.SFP_ERROR_DESCRIPTION_I2C_STUCK),
        (3, cls.SFP_ERROR_DESCRIPTION_BAD_EEPROM),
        (4, cls.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST),
        (5, cls.SFP_ERROR_DESCRIPTION_UNSUPPORTED_CABLE),
        (6, cls.SFP_ERROR_DESCRIPTION_HIGH_TEMP),
        (7, cls.SFP_ERROR_DESCRIPTION_BAD_CABLE),
        (8, cls.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED),
        (12, cls.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED),
        (255, cls.SFP_MLNX_ERROR_DESCRIPTION_RESERVED),
    )
    return dict(code_description_pairs)

def get_error_description(self):
    """
    Get the description of the current status of the SFP module

    Takes no arguments: the operational status and the error code are
    both fetched through _get_error_code.

    Returns:
        The SFP_STATUS_* description when the module is initializing,
        plugged, unplugged or disabled; the entry from
        _get_error_description_dict when the module is plugged with a
        known error; otherwise a string naming the unknown error code
        or the unknown module status
    """
    oper_status, error_code = self._get_error_code()

    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR:
        # error_code is only consulted in this state
        return self._get_error_description_dict().get(
            error_code, "Unknown error ({})".format(error_code))

    if oper_status == SX_PORT_MODULE_STATUS_INITIALIZING:
        return self.SFP_STATUS_INITIALIZING
    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED:
        return self.SFP_STATUS_OK
    if oper_status == SX_PORT_MODULE_STATUS_UNPLUGGED:
        return self.SFP_STATUS_UNPLUGGED
    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED_DISABLED:
        return self.SFP_STATUS_DISABLED

    return "Unknown SFP module status ({})".format(oper_status)
85 changes: 58 additions & 27 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,16 @@
import os
import time
import select
from python_sdk_api.sx_api import *
# Import the real SDK bindings unless MLNX_PLATFORM_API_UNIT_TESTING is
# present in the environment; under unit test, replace the two SDK
# allocators named here with mocks instead.
if 'MLNX_PLATFORM_API_UNIT_TESTING' not in os.environ:
    from python_sdk_api.sx_api import *
else:
    from mock import MagicMock
    class MockSxFd(object):
        # Fixed file descriptor value exposed by the object that the mocked
        # new_sx_fd_t_p returns
        fd = 99
    new_sx_fd_t_p = MagicMock(return_value=MockSxFd())
    new_sx_user_channel_t_p = MagicMock()
from sonic_py_common.logger import Logger
from .sfp import SFP

# SFP status from PMAOS register
# 0x1 plug in
Expand All @@ -23,15 +31,6 @@
SDK_SFP_STATE_ERR = 0x3
SDK_SFP_STATE_DIS = 0x4

# SFP status that will be handled by XCVRD
STATUS_PLUGIN = '1'
STATUS_PLUGOUT = '0'
STATUS_ERR_I2C_STUCK = '2'
STATUS_ERR_BAD_EEPROM = '3'
STATUS_ERR_UNSUPPORTED_CABLE = '4'
STATUS_ERR_HIGH_TEMP = '5'
STATUS_ERR_BAD_CABLE = '6'

# SFP status used in this file only, will not expose to XCVRD
# STATUS_ERROR will be mapped to different status according to the error code
STATUS_UNKNOWN = '-1'
Expand Down Expand Up @@ -61,19 +60,39 @@
'''

# SFP errors that will block eeprom accessing
sdk_sfp_err_type_dict = {
0x2: STATUS_ERR_I2C_STUCK,
0x3: STATUS_ERR_BAD_EEPROM,
0x5: STATUS_ERR_UNSUPPORTED_CABLE,
0x6: STATUS_ERR_HIGH_TEMP,
0x7: STATUS_ERR_BAD_CABLE
# SDK error types for which check_sfp_status additionally sets
# SFP.SFP_ERROR_BIT_BLOCKING on the reported state
SDK_SFP_BLOCKING_ERRORS = [
    0x2, # SFP.SFP_ERROR_BIT_I2C_STUCK,
    0x3, # SFP.SFP_ERROR_BIT_BAD_EEPROM,
    0x5, # SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
    0x6, # SFP.SFP_ERROR_BIT_HIGH_TEMP,
    0x7, # SFP.SFP_ERROR_BIT_BAD_CABLE
]

# SDK error type -> error bit defined on SFP; an error type missing from
# this table is logged as unrecognized by check_sfp_status and not reported
SDK_ERRORS_TO_ERROR_BITS = {
    0x0: SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED,
    0x1: SFP.SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE,
    0x2: SFP.SFP_ERROR_BIT_I2C_STUCK,
    0x3: SFP.SFP_ERROR_BIT_BAD_EEPROM,
    0x4: SFP.SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST,
    0x5: SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
    0x6: SFP.SFP_ERROR_BIT_HIGH_TEMP,
    0x7: SFP.SFP_ERROR_BIT_BAD_CABLE,
    0x8: SFP.SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED,
    0xc: SFP.SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED
}

# SDK error type -> description, for the vendor specific (SFP_MLNX_*)
# errors only; check_sfp_status copies these into its error_dict argument
SDK_ERRORS_TO_DESCRIPTION = {
    0x1: SFP.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE,
    0x4: SFP.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST,
    0x8: SFP.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED,
    0xc: SFP.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED
}

sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
SDK_SFP_STATE_IN: str(SFP.SFP_STATUS_BIT_INSERTED),
SDK_SFP_STATE_OUT: str(SFP.SFP_STATUS_BIT_REMOVED),
SDK_SFP_STATE_ERR: STATUS_ERROR,
SDK_SFP_STATE_DIS: STATUS_PLUGOUT,
SDK_SFP_STATE_DIS: str(SFP.SFP_STATUS_BIT_REMOVED),
}

# system level event/error
Expand Down Expand Up @@ -196,7 +215,7 @@ def deinitialize(self):
delete_sx_fd_t_p(self.rx_fd_p)
delete_sx_user_channel_t_p(self.user_channel_p)

def check_sfp_status(self, port_change, timeout):
def check_sfp_status(self, port_change, error_dict, timeout):
"""
the meaning of timeout is aligned with select.select, which has the following meaning:
0: poll, returns without blocked
Expand Down Expand Up @@ -234,6 +253,7 @@ def check_sfp_status(self, port_change, timeout):
break

sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN)
error_description = None
if sfp_state == STATUS_UNKNOWN:
# in the following sequence, STATUS_UNKNOWN can be returned.
# so we shouldn't raise exception here.
Expand All @@ -248,18 +268,29 @@ def check_sfp_status(self, port_change, timeout):

# If the SDK reports SFP status error (0x3), read error_type to get the detailed error
if sfp_state == STATUS_ERROR:
if error_type in sdk_sfp_err_type_dict.keys():
# In SFP at error status case, need to overwrite the sfp_state with the exact error code
sfp_state = sdk_sfp_err_type_dict[error_type]
else:
# For errors don't block the eeprom accessing, we don't report it to XCVRD
logger.log_info("SFP error on port but not blocking eeprom read, error_type {}".format(error_type))
found +=1
sfp_state_bits = SDK_ERRORS_TO_ERROR_BITS.get(error_type)
if sfp_state_bits is None:
logger.log_error("Unrecognized error {} detected on ports {}".format(error_type, port_list))
found += 1
continue

if error_type in SDK_SFP_BLOCKING_ERRORS:
# Errors that block eeprom access are flagged with the blocking bit on top of their own error bit
sfp_state_bits |= SFP.SFP_ERROR_BIT_BLOCKING

# An error should be always set along with 'INSERTED'
sfp_state_bits |= SFP.SFP_STATUS_BIT_INSERTED

# For vendor specific errors, the description should be returned as well
error_description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)

sfp_state = str(sfp_state_bits)

for port in port_list:
logger.log_info("SFP on port {} state {}".format(port, sfp_state))
port_change[port+1] = sfp_state
if error_description:
error_dict[port+1] = error_description
found += 1

return found != 0
Expand Down
43 changes: 42 additions & 1 deletion platform/mellanox/mlnx-platform-api/tests/test_sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

os.environ["PLATFORM_API_UNIT_TESTING"] = "1"

from sonic_py_common import device_info
from sonic_platform.sfp import SFP
from sonic_platform.sfp import SFP, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED

from sonic_platform.chassis import Chassis


Expand All @@ -26,8 +29,14 @@ def mock_get_sdk_handle(self):
self.sdk_handle = 1
return self.sdk_handle


def mock_get_sfp_error_code(self):
    """Stand-in for SFP._get_error_code: return the codes stored on the instance by the test"""
    oper_code = self.oper_code
    error_code = self.error_code
    return oper_code, error_code


device_info.get_platform = mock_get_platform
SFP._read_eeprom_specific_bytes = mock_read_eeprom_specific_bytes
SFP._get_error_code = mock_get_sfp_error_code
Chassis.get_sdk_handle = mock_get_sdk_handle


Expand Down Expand Up @@ -82,3 +91,35 @@ def test_sfp_full_initialize_without_partial():
# Verify when get_sfp is called, the SFP modules won't be initialized again
sfp1 = allsfp[0]
assert sfp1 == chassis.get_sfp(1)


def test_sfp_get_error_status():
    """Check SFP.get_error_description for each error code and each module status"""
    # Any SFP module will do; take the first one
    sfp = Chassis().get_sfp(1)

    # Plugged with error: every code in the table yields its own description
    sfp.oper_code = SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR
    known_errors = sfp._get_error_description_dict()
    for error_code in known_errors:
        sfp.error_code = error_code
        assert sfp.get_error_description() == known_errors[sfp.error_code]

    # Plugged with error: a code outside the table is reported as unknown
    sfp.error_code = -1
    assert sfp.get_error_description() == "Unknown error (-1)"

    # Every other status has a fixed description
    status_descriptions = {
        SX_PORT_MODULE_STATUS_INITIALIZING: "Initializing",
        SX_PORT_MODULE_STATUS_PLUGGED: "OK",
        SX_PORT_MODULE_STATUS_UNPLUGGED: "Unplugged",
        SX_PORT_MODULE_STATUS_PLUGGED_DISABLED: "Disabled",
    }
    for oper_code, expected_description in status_descriptions.items():
        sfp.oper_code = oper_code
        assert sfp.get_error_description() == expected_description
46 changes: 46 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_sfp_event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import select
import sys

from mock import MagicMock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform_base.sfp_base import SfpBase

class TestSfpEvent(object):
    """Tests for sfp_event.check_sfp_status driven by a mocked on_pmpe"""

    @classmethod
    def setup_class(cls):
        # Set before sonic_platform.sfp_event is imported (the imports are
        # done inside the test methods) so it uses its mocked SDK stubs
        os.environ["MLNX_PLATFORM_API_UNIT_TESTING"] = "1"
        # NOTE(review): select.select stays patched after this class runs;
        # confirm no later test relies on the real implementation
        select.select = MagicMock(return_value=([99], None, None))

    def test_check_sfp_status(self):
        """Run plug-in, plug-out and every known SDK error type through check_sfp_status"""
        from sonic_platform.sfp_event import SDK_SFP_STATE_IN, SDK_SFP_STATE_OUT, SDK_SFP_STATE_ERR
        from sonic_platform.sfp_event import SDK_ERRORS_TO_ERROR_BITS, SDK_ERRORS_TO_DESCRIPTION, SDK_SFP_BLOCKING_ERRORS

        self.executor(SDK_SFP_STATE_IN, None, SfpBase.SFP_STATUS_BIT_INSERTED)
        self.executor(SDK_SFP_STATE_OUT, None, SfpBase.SFP_STATUS_BIT_REMOVED)
        for error_type, error_status in SDK_ERRORS_TO_ERROR_BITS.items():
            # Only vendor specific errors have a description; None otherwise
            description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
            if error_type in SDK_SFP_BLOCKING_ERRORS:
                error_status |= SfpBase.SFP_ERROR_BIT_BLOCKING
            # An error is always reported together with the inserted bit
            error_status |= SfpBase.SFP_STATUS_BIT_INSERTED
            self.executor(SDK_SFP_STATE_ERR, error_type, error_status, description)

    def executor(self, mock_module_state, mock_error_type, expect_status, description=None):
        """
        Feed one mocked event for ports 0 and 1 into check_sfp_status

        Args:
            mock_module_state: module state returned by the mocked on_pmpe
            mock_error_type: error type returned by the mocked on_pmpe
            expect_status: status expected for both ports, compared as a string
            description: error description expected for both ports, or None
                         when no description should be reported at all
        """
        from sonic_platform.sfp_event import sfp_event

        event = sfp_event()
        event.on_pmpe = MagicMock(return_value=(True, [0,1], mock_module_state, mock_error_type))
        port_change = {}
        error_dict = {}
        found = event.check_sfp_status(port_change, error_dict, 0)
        assert found
        expect_status_str = str(expect_status)
        # Ports are reported 1-based
        assert 1 in port_change and port_change[1] == expect_status_str
        assert 2 in port_change and port_change[2] == expect_status_str
        if description:
            assert 1 in error_dict and error_dict[1] == description
            assert 2 in error_dict and error_dict[2] == description
        else:
            # No description expected, so nothing may be reported
            assert not error_dict