Skip to content

Commit

Permalink
[Mellanox] mlnx-sfpd init flow enhancement (#3294)
Browse files Browse the repository at this point in the history
* fix sfpd initialize issue
* fix review comments
* rephrase the output log
* fix retry counter
* change the number of retries to 10, which sets the maximum waiting time to 1024s
* fix mlnx-sfpd init flow with new solution
* [mlnx-sfpd] address comments
1. wait 5 seconds per retry for up to 30 retries (150 seconds in total), using a constant wait time for each retry.
2. use a try/except structure so that errors can be handled gracefully
* [mlnx-sfpd] wait 5 seconds after SDK_DAEMON_READY_FILE exists to make sure SDK is fully up.
* [mlnx-sfpd] simplify initialization by calling deinitialize on initialization failure
  • Loading branch information
stephenxs authored and yxieca committed Aug 14, 2019
1 parent b80d60c commit c17cd19
Showing 1 changed file with 73 additions and 31 deletions.
104 changes: 73 additions & 31 deletions platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'

SFPD_LIVENESS_EXPIRE_SECS = 30

SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'

sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
Expand Down Expand Up @@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
class MlnxSfpd:
''' Listen to plugin/plugout cable events '''

SX_OPEN_RETRIES = 20
SX_OPEN_RETRIES = 30
SX_OPEN_TIMEOUT = 5
SELECT_TIMEOUT = 1

def __init__(self):
Expand All @@ -75,7 +78,6 @@ class MlnxSfpd:
# Allocate SDK fd and user channel structures
self.rx_fd_p = new_sx_fd_t_p()
self.user_channel_p = new_sx_user_channel_t_p()

self.state_db = SonicV2Connector(host=REDIS_HOSTIP)

# Register our signal handlers
Expand All @@ -98,37 +100,78 @@ class MlnxSfpd:
def initialize(self):
# Bring the daemon up: connect to STATE_DB, wait for the SDK, open the SDK
# handle and host-interface fd, wait for a switch to exist, then register
# for the SX_TRAP_ID_PMPE trap on the user channel.
#
# Raises/handles: inside the try block every failure is raised as
# RuntimeError, caught by the broad handler at the bottom, logged, and
# followed by self.deinitialize(); it is not re-raised from there.
#
# NOTE(review): this listing appears to be a rendered diff with its +/-
# markers and indentation lost, so pre-change and post-change statements
# are interleaved (e.g. two different SDK-open sequences are present).
# Which lines are live cannot be told from here -- confirm against the
# actual file before relying on any single statement below.
self.state_db.connect("STATE_DB")

# Open the SDK API handle, retrying at most SX_OPEN_RETRIES times while
# waiting for the SDK to be started during system startup.
retry = 1
while True:
rc, self.handle = sx_api_open(None)
if rc == SX_STATUS_SUCCESS:
break

log_warning("failed to open SDK API handle... retrying {}".format(retry))
# Pre-set to None so the exception handler at the bottom can test whether
# the counter pointer still needs to be freed.
swid_cnt_p = None

# Exponential backoff: the sleep doubles with every retry (2 ** retry seconds).
time.sleep(2 ** retry)
retry += 1

if retry > self.SX_OPEN_RETRIES:
raise RuntimeError("failed to open SDK API handle after {} retries".format(retry))

rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
try:
# Wait for the SDK daemon to start by polling for the sdk_ready file
# (SDK_DAEMON_READY_FILE), sleeping SX_OPEN_TIMEOUT seconds between checks
# and giving up after SX_OPEN_RETRIES checks.
retry = 0
while not os.path.exists(SDK_DAEMON_READY_FILE):
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
else:
log_info("SDK daemon not started yet, retry {} times".format(retry))
retry = retry + 1
time.sleep(self.SX_OPEN_TIMEOUT)

# Point the user channel at the receive fd so trap events are delivered on it.
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
self.user_channel_p.channel.fd = self.rx_fd_p
# One extra fixed delay after the ready file appears, to make sure the
# SDK daemon has actually started.
time.sleep(self.SX_OPEN_TIMEOUT)

rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)
if rc != SX_STATUS_SUCCESS:
# NOTE(review): `c` is not defined anywhere in the visible code, so this
# format call would raise NameError instead of the intended RuntimeError;
# `rc` was presumably meant -- confirm.
raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c))
# Once the SDK daemon has started, sx_api_open and sx_api_host_ifc_open
# are ready to be called.
rc, self.handle = sx_api_open(None)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))

rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))

# Point the user channel at the receive fd so trap events are delivered on it.
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
self.user_channel_p.channel.fd = self.rx_fd_p

# Wait for the switch to be created and initialized inside the SDK:
# poll the swid count until it is greater than zero.
retry = 0
swid_cnt_p = new_uint32_t_p()
uint32_t_p_assign(swid_cnt_p, 0)
swid_cnt = 0
while True:
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
else:
# A None list argument is passed here; only the count written to
# swid_cnt_p is read back below.
rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
if rc == SX_STATUS_SUCCESS:
swid_cnt = uint32_t_p_value(swid_cnt_p)
if swid_cnt > 0:
# Free the counter and clear the name so the exception handler
# below does not free it a second time.
delete_uint32_t_p(swid_cnt_p)
swid_cnt_p = None
break
else:
log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
.format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
else:
# A failing return code is treated as fatal rather than retried.
raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
format(rc, retry, self.SX_OPEN_TIMEOUT * retry))

retry = retry + 1
time.sleep(self.SX_OPEN_TIMEOUT)

# After the switch has been created inside the SDK,
# sx_api_host_ifc_trap_id_register_set is ready to be called.
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)

if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))

# Only reached when every step above succeeded.
self.running = True
except Exception as e:
# Broad catch on purpose: any failure is logged and cleaned up here;
# the exception is not re-raised, and self.running is only assigned
# True on the success path above.
log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
# Free the swid counter if the polling loop did not already release it.
if swid_cnt_p is not None:
delete_uint32_t_p(swid_cnt_p)
self.deinitialize()

def deinitialize(self):
# remove mlnx-sfpd liveness key in DB if not expired yet
Expand Down Expand Up @@ -156,7 +199,6 @@ class MlnxSfpd:
log_error("sx_api_close exited with error, rc {}".format(rc))

def run(self):
self.running = True

while self.running:
try:
Expand Down

0 comments on commit c17cd19

Please sign in to comment.