[Mellanox] mlnx-sfpd init flow enhancement (#3294)
* fix sfpd initialization issue * address review comments * rephrase the output log * fix retry counter * change the retry count to 10, which sets the maximum waiting time to 1024s * fix mlnx-sfpd init flow with a new solution * [mlnx-sfpd] address comments: 1. wait 5 seconds per retry for up to 30 retries (150 seconds in total), using a constant wait time for each retry; 2. use a try/except structure so that errors can be handled gracefully * [mlnx-sfpd] wait 5 seconds after SDK_DAEMON_READY_FILE exists to make sure the SDK is fully up * [mlnx-sfpd] simplify initialization by calling deinitialize on initialization failure
This commit is contained in:
parent
b80d60c277
commit
c17cd19e49
@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'
|
||||
|
||||
SFPD_LIVENESS_EXPIRE_SECS = 30
|
||||
|
||||
SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'
|
||||
|
||||
sfp_value_status_dict = {
|
||||
SDK_SFP_STATE_IN: STATUS_PLUGIN,
|
||||
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
|
||||
@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
|
||||
class MlnxSfpd:
|
||||
''' Listen to plugin/plugout cable events '''
|
||||
|
||||
SX_OPEN_RETRIES = 20
|
||||
SX_OPEN_RETRIES = 30
|
||||
SX_OPEN_TIMEOUT = 5
|
||||
SELECT_TIMEOUT = 1
|
||||
|
||||
def __init__(self):
|
||||
@ -75,7 +78,6 @@ class MlnxSfpd:
|
||||
# Allocate SDK fd and user channel structures
|
||||
self.rx_fd_p = new_sx_fd_t_p()
|
||||
self.user_channel_p = new_sx_user_channel_t_p()
|
||||
|
||||
self.state_db = SonicV2Connector(host=REDIS_HOSTIP)
|
||||
|
||||
# Register our signal handlers
|
||||
@ -98,37 +100,78 @@ class MlnxSfpd:
|
||||
def initialize(self):
|
||||
self.state_db.connect("STATE_DB")
|
||||
|
||||
# open SDK API handle
|
||||
# retry at most SX_OPEN_RETRIES times to wait
|
||||
# until SDK is started during system startup
|
||||
retry = 1
|
||||
while True:
|
||||
swid_cnt_p = None
|
||||
|
||||
try:
|
||||
# Wait for the SDK daemon to start by detecting the sdk_ready file
|
||||
retry = 0
|
||||
while not os.path.exists(SDK_DAEMON_READY_FILE):
|
||||
if retry >= self.SX_OPEN_RETRIES:
|
||||
raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
|
||||
.format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
|
||||
else:
|
||||
log_info("SDK daemon not started yet, retry {} times".format(retry))
|
||||
retry = retry + 1
|
||||
time.sleep(self.SX_OPEN_TIMEOUT)
|
||||
|
||||
# to make sure SDK daemon has started
|
||||
time.sleep(self.SX_OPEN_TIMEOUT)
|
||||
|
||||
# After the SDK daemon has started, sx_api_open and sx_api_host_ifc_open are ready to be called
|
||||
rc, self.handle = sx_api_open(None)
|
||||
if rc == SX_STATUS_SUCCESS:
|
||||
break
|
||||
if rc != SX_STATUS_SUCCESS:
|
||||
raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))
|
||||
|
||||
log_warning("failed to open SDK API handle... retrying {}".format(retry))
|
||||
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
|
||||
if rc != SX_STATUS_SUCCESS:
|
||||
raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))
|
||||
|
||||
time.sleep(2 ** retry)
|
||||
retry += 1
|
||||
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
|
||||
self.user_channel_p.channel.fd = self.rx_fd_p
|
||||
|
||||
if retry > self.SX_OPEN_RETRIES:
|
||||
raise RuntimeError("failed to open SDK API handle after {} retries".format(retry))
|
||||
# Wait for switch to be created and initialized inside SDK
|
||||
retry = 0
|
||||
swid_cnt_p = new_uint32_t_p()
|
||||
uint32_t_p_assign(swid_cnt_p, 0)
|
||||
swid_cnt = 0
|
||||
while True:
|
||||
if retry >= self.SX_OPEN_RETRIES:
|
||||
raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
|
||||
.format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
|
||||
else:
|
||||
rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
|
||||
if rc == SX_STATUS_SUCCESS:
|
||||
swid_cnt = uint32_t_p_value(swid_cnt_p)
|
||||
if swid_cnt > 0:
|
||||
delete_uint32_t_p(swid_cnt_p)
|
||||
swid_cnt_p = None
|
||||
break
|
||||
else:
|
||||
log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
|
||||
.format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
|
||||
else:
|
||||
raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
|
||||
format(rc, retry, self.SX_OPEN_TIMEOUT * retry))
|
||||
|
||||
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
|
||||
if rc != SX_STATUS_SUCCESS:
|
||||
raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
|
||||
retry = retry + 1
|
||||
time.sleep(self.SX_OPEN_TIMEOUT)
|
||||
|
||||
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
|
||||
self.user_channel_p.channel.fd = self.rx_fd_p
|
||||
# After the switch has been created inside the SDK, sx_api_host_ifc_trap_id_register_set is ready to be called
|
||||
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
|
||||
SX_ACCESS_CMD_REGISTER,
|
||||
self.swid,
|
||||
SX_TRAP_ID_PMPE,
|
||||
self.user_channel_p)
|
||||
|
||||
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
|
||||
SX_ACCESS_CMD_REGISTER,
|
||||
self.swid,
|
||||
SX_TRAP_ID_PMPE,
|
||||
self.user_channel_p)
|
||||
if rc != SX_STATUS_SUCCESS:
|
||||
raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c))
|
||||
if rc != SX_STATUS_SUCCESS:
|
||||
raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))
|
||||
|
||||
self.running = True
|
||||
except Exception as e:
|
||||
log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
|
||||
if swid_cnt_p is not None:
|
||||
delete_uint32_t_p(swid_cnt_p)
|
||||
self.deinitialize()
|
||||
|
||||
def deinitialize(self):
|
||||
# remove mlnx-sfpd liveness key in DB if not expired yet
|
||||
@ -156,7 +199,6 @@ class MlnxSfpd:
|
||||
log_error("sx_api_close exited with error, rc {}".format(rc))
|
||||
|
||||
def run(self):
|
||||
self.running = True
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
|
Loading…
Reference in New Issue
Block a user