[Mellanox] mlnx-sfpd init flow enhancement (#3294)

* fix sfpd initialization issue
* fix review comments
* rephrase the output log
* fix retry counter
* change the retry count to 10, which sets the max waiting time to 1024s
* fix mlnx-sfpd init flow with new solution
* [mlnx-sfpd] address comments
1. wait 5 seconds * 30 times, 150 seconds in total; use a constant wait time for each retry.
2. use a try/except structure so that errors can be handled gracefully
* [mlnx-sfpd] wait 5 seconds after SDK_DAEMON_READY_FILE exists to make sure SDK is fully up.
* [mlnx-sfpd] simplify initialization by calling deinitialize on initialization failure
This commit is contained in:
Stephen Sun 2019-08-09 01:17:48 +08:00 committed by Ying Xie
parent b80d60c277
commit c17cd19e49

View File

@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'
SFPD_LIVENESS_EXPIRE_SECS = 30
SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'
sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
class MlnxSfpd:
''' Listen to plugin/plugout cable events '''
SX_OPEN_RETRIES = 20
SX_OPEN_RETRIES = 30
SX_OPEN_TIMEOUT = 5
SELECT_TIMEOUT = 1
def __init__(self):
@ -75,7 +78,6 @@ class MlnxSfpd:
# Allocate SDK fd and user channel structures
self.rx_fd_p = new_sx_fd_t_p()
self.user_channel_p = new_sx_user_channel_t_p()
self.state_db = SonicV2Connector(host=REDIS_HOSTIP)
# Register our signal handlers
@ -98,37 +100,78 @@ class MlnxSfpd:
def initialize(self):
# NOTE(review): this span is a rendered diff hunk whose +/- markers and
# indentation were stripped, so lines from the pre-change and post-change
# versions of initialize() are interleaved. Which side a line belongs to is
# inferred in the notes below and must be confirmed against the repository.
#
# Flow shown by the visible lines: connect to STATE_DB, poll for
# SDK_DAEMON_READY_FILE, open the SDK API handle and the host interface fd,
# poll sx_api_port_swid_list_get until it reports a switch count above zero,
# register for SX_TRAP_ID_PMPE on the user channel, then set self.running.
# Any exception raised on the way is logged and followed by
# self.deinitialize().
self.state_db.connect("STATE_DB")
# NOTE(review): the next three comments plus `retry = 1` / `while True:`
# look like the header of the pre-change sx_api_open retry loop -- confirm.
# open SDK API handle
# retry at most SX_OPEN_RETRIES times to wait
# until SDK is started during system startup
retry = 1
while True:
# Starts as None so the except handler can tell whether the counter pointer
# was allocated before the failure.
swid_cnt_p = None
try:
# Wait for the SDK daemon to start by polling for the sdk_ready file
retry = 0
while not os.path.exists(SDK_DAEMON_READY_FILE):
# Give up once SX_OPEN_RETRIES polls have been made.
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
else:
log_info("SDK daemon not started yet, retry {} times".format(retry))
retry = retry + 1
# Constant wait between polls.
time.sleep(self.SX_OPEN_TIMEOUT)
# Sleep one more interval after the ready file exists to make sure the SDK daemon has started
time.sleep(self.SX_OPEN_TIMEOUT)
# After SDK daemon started, sx_api_open and sx_api_host_ifc_open is ready for call
rc, self.handle = sx_api_open(None)
# NOTE(review): the `rc == SX_STATUS_SUCCESS` / `break` pair presumably
# belongs to the pre-change retry loop; the `rc != ...` / raise pair looks
# like its post-change replacement -- confirm.
if rc == SX_STATUS_SUCCESS:
break
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))
# NOTE(review): presumably a pre-change line (retry warning) -- confirm.
log_warning("failed to open SDK API handle... retrying {}".format(retry))
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))
# NOTE(review): the exponential sleep and `retry += 1` presumably belong to
# the pre-change retry loop -- confirm.
time.sleep(2 ** retry)
retry += 1
# Deliver events through the file descriptor opened above.
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
self.user_channel_p.channel.fd = self.rx_fd_p
# NOTE(review): presumably the pre-change loop's retry-limit check -- confirm.
if retry > self.SX_OPEN_RETRIES:
raise RuntimeError("failed to open SDK API handle after {} retries".format(retry))
# Wait for switch to be created and initialized inside SDK
retry = 0
# Counter pointer handed to sx_api_port_swid_list_get; it is deleted either
# on success below or in the except handler.
swid_cnt_p = new_uint32_t_p()
uint32_t_p_assign(swid_cnt_p, 0)
swid_cnt = 0
while True:
# Give up once SX_OPEN_RETRIES polls have been made.
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
else:
rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
if rc == SX_STATUS_SUCCESS:
swid_cnt = uint32_t_p_value(swid_cnt_p)
if swid_cnt > 0:
# Free the pointer and reset it to None so the except handler does not
# delete it a second time.
delete_uint32_t_p(swid_cnt_p)
swid_cnt_p = None
break
else:
log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
.format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
else:
raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
format(rc, retry, self.SX_OPEN_TIMEOUT * retry))
# NOTE(review): this second sx_api_host_ifc_open call and its check are
# presumably pre-change lines shown here by the diff -- confirm.
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
retry = retry + 1
time.sleep(self.SX_OPEN_TIMEOUT)
# NOTE(review): this repeat of the user-channel setup is presumably the
# pre-change copy -- confirm.
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
self.user_channel_p.channel.fd = self.rx_fd_p
# After switch was created inside SDK, sx_api_host_ifc_trap_id_register_set is ready to call
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)
# NOTE(review): the registration call appears twice; one copy is presumably
# the pre-change side of the diff and the other the post-change side.
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)
# NOTE(review): `c` is not defined anywhere in the visible code, so this
# format call would raise NameError if reached; `rc` looks intended. The
# check right after it formats `rc`, so this is presumably the pre-change
# line it replaces -- confirm.
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c))
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))
self.running = True
except Exception as e:
# Single failure path: log the cause, free the counter pointer if it is
# still allocated, then call deinitialize().
log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
if swid_cnt_p is not None:
delete_uint32_t_p(swid_cnt_p)
self.deinitialize()
def deinitialize(self):
# remove mlnx-sfpd liveness key in DB if not expired yet
@ -156,7 +199,6 @@ class MlnxSfpd:
log_error("sx_api_close exited with error, rc {}".format(rc))
def run(self):
self.running = True
while self.running:
try: