[Mellanox] mlnx-sfpd init flow enhancement (#3294)

* fix sfpd initialize issue
* fix review comments
* rephrase the output log
* fix retry counter
* change the number of retries to 10, which sets the max waiting time to 1024s
* fix mlnx-sfpd init flow with new solution
* [mlnx-sfpd] address comments
1. wait for 5 seconds * 30 times, 150 seconds in total; use a constant wait time for each retry.
2. use a try/except structure so that errors can be handled gracefully
* [mlnx-sfpd] wait 5 seconds after SDK_DAEMON_READY_FILE exists to make sure SDK is fully up.
* [mlnx-sfpd] simplify initialization by calling deinitialize on initialization failure
This commit is contained in:
Stephen Sun 2019-08-09 01:17:48 +08:00 committed by Ying Xie
parent b80d60c277
commit c17cd19e49

View File

@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'
SFPD_LIVENESS_EXPIRE_SECS = 30 SFPD_LIVENESS_EXPIRE_SECS = 30
SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'
sfp_value_status_dict = { sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN, SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT, SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
class MlnxSfpd: class MlnxSfpd:
''' Listen to plugin/plugout cable events ''' ''' Listen to plugin/plugout cable events '''
SX_OPEN_RETRIES = 20 SX_OPEN_RETRIES = 30
SX_OPEN_TIMEOUT = 5
SELECT_TIMEOUT = 1 SELECT_TIMEOUT = 1
def __init__(self): def __init__(self):
@ -75,7 +78,6 @@ class MlnxSfpd:
# Allocate SDK fd and user channel structures # Allocate SDK fd and user channel structures
self.rx_fd_p = new_sx_fd_t_p() self.rx_fd_p = new_sx_fd_t_p()
self.user_channel_p = new_sx_user_channel_t_p() self.user_channel_p = new_sx_user_channel_t_p()
self.state_db = SonicV2Connector(host=REDIS_HOSTIP) self.state_db = SonicV2Connector(host=REDIS_HOSTIP)
# Register our signal handlers # Register our signal handlers
@ -98,37 +100,78 @@ class MlnxSfpd:
def initialize(self): def initialize(self):
self.state_db.connect("STATE_DB") self.state_db.connect("STATE_DB")
# open SDK API handle swid_cnt_p = None
# retry at most SX_OPEN_RETRIES times to wait
# until SDK is started during system startup try:
retry = 1 # Wait for SDK daemon to be started with detect the sdk_ready file
while True: retry = 0
while not os.path.exists(SDK_DAEMON_READY_FILE):
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
else:
log_info("SDK daemon not started yet, retry {} times".format(retry))
retry = retry + 1
time.sleep(self.SX_OPEN_TIMEOUT)
# to make sure SDK daemon has started
time.sleep(self.SX_OPEN_TIMEOUT)
# After SDK daemon started, sx_api_open and sx_api_host_ifc_open is ready for call
rc, self.handle = sx_api_open(None) rc, self.handle = sx_api_open(None)
if rc == SX_STATUS_SUCCESS: if rc != SX_STATUS_SUCCESS:
break raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))
log_warning("failed to open SDK API handle... retrying {}".format(retry)) rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))
time.sleep(2 ** retry) self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
retry += 1 self.user_channel_p.channel.fd = self.rx_fd_p
if retry > self.SX_OPEN_RETRIES: # Wait for switch to be created and initialized inside SDK
raise RuntimeError("failed to open SDK API handle after {} retries".format(retry)) retry = 0
swid_cnt_p = new_uint32_t_p()
uint32_t_p_assign(swid_cnt_p, 0)
swid_cnt = 0
while True:
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
else:
rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
if rc == SX_STATUS_SUCCESS:
swid_cnt = uint32_t_p_value(swid_cnt_p)
if swid_cnt > 0:
delete_uint32_t_p(swid_cnt_p)
swid_cnt_p = None
break
else:
log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
.format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
else:
raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
format(rc, retry, self.SX_OPEN_TIMEOUT * retry))
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p) retry = retry + 1
if rc != SX_STATUS_SUCCESS: time.sleep(self.SX_OPEN_TIMEOUT)
raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD # After switch was created inside SDK, sx_api_host_ifc_trap_id_register_set is ready to call
self.user_channel_p.channel.fd = self.rx_fd_p rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)
rc = sx_api_host_ifc_trap_id_register_set(self.handle, if rc != SX_STATUS_SUCCESS:
SX_ACCESS_CMD_REGISTER, raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))
self.swid,
SX_TRAP_ID_PMPE, self.running = True
self.user_channel_p) except Exception as e:
if rc != SX_STATUS_SUCCESS: log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c)) if swid_cnt_p is not None:
delete_uint32_t_p(swid_cnt_p)
self.deinitialize()
def deinitialize(self): def deinitialize(self):
# remove mlnx-sfpd liveness key in DB if not expired yet # remove mlnx-sfpd liveness key in DB if not expired yet
@ -156,7 +199,6 @@ class MlnxSfpd:
log_error("sx_api_close exited with error, rc {}".format(rc)) log_error("sx_api_close exited with error, rc {}".format(rc))
def run(self): def run(self):
self.running = True
while self.running: while self.running:
try: try: