[Mellanox] mlnx-sfpd init flow enhancement (#3294)
* Fix sfpd initialization issue
* Address review comments
* Rephrase the output log
* Fix retry counter
* Change the retry count to 10, which sets the maximum waiting time to 1024s
* Fix mlnx-sfpd init flow with a new solution
* [mlnx-sfpd] Address comments:
  1. Wait 5 seconds per retry for up to 30 retries (150 seconds in total), using a constant wait time for each retry.
  2. Use a try/except structure so that errors can be handled gracefully.
* [mlnx-sfpd] Wait 5 seconds after SDK_DAEMON_READY_FILE exists to make sure the SDK is fully up
* [mlnx-sfpd] Simplify initialization by calling deinitialize on initialization failure
This commit is contained in:
parent
b80d60c277
commit
c17cd19e49
@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'
|
|||||||
|
|
||||||
SFPD_LIVENESS_EXPIRE_SECS = 30
|
SFPD_LIVENESS_EXPIRE_SECS = 30
|
||||||
|
|
||||||
|
SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'
|
||||||
|
|
||||||
sfp_value_status_dict = {
|
sfp_value_status_dict = {
|
||||||
SDK_SFP_STATE_IN: STATUS_PLUGIN,
|
SDK_SFP_STATE_IN: STATUS_PLUGIN,
|
||||||
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
|
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
|
||||||
@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
|
|||||||
class MlnxSfpd:
|
class MlnxSfpd:
|
||||||
''' Listen to plugin/plugout cable events '''
|
''' Listen to plugin/plugout cable events '''
|
||||||
|
|
||||||
SX_OPEN_RETRIES = 20
|
SX_OPEN_RETRIES = 30
|
||||||
|
SX_OPEN_TIMEOUT = 5
|
||||||
SELECT_TIMEOUT = 1
|
SELECT_TIMEOUT = 1
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -75,7 +78,6 @@ class MlnxSfpd:
|
|||||||
# Allocate SDK fd and user channel structures
|
# Allocate SDK fd and user channel structures
|
||||||
self.rx_fd_p = new_sx_fd_t_p()
|
self.rx_fd_p = new_sx_fd_t_p()
|
||||||
self.user_channel_p = new_sx_user_channel_t_p()
|
self.user_channel_p = new_sx_user_channel_t_p()
|
||||||
|
|
||||||
self.state_db = SonicV2Connector(host=REDIS_HOSTIP)
|
self.state_db = SonicV2Connector(host=REDIS_HOSTIP)
|
||||||
|
|
||||||
# Register our signal handlers
|
# Register our signal handlers
|
||||||
@ -98,37 +100,78 @@ class MlnxSfpd:
|
|||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.state_db.connect("STATE_DB")
|
self.state_db.connect("STATE_DB")
|
||||||
|
|
||||||
# open SDK API handle
|
swid_cnt_p = None
|
||||||
# retry at most SX_OPEN_RETRIES times to wait
|
|
||||||
# until SDK is started during system startup
|
try:
|
||||||
retry = 1
|
# Wait for the SDK daemon to start by detecting the sdk_ready file
|
||||||
while True:
|
retry = 0
|
||||||
|
while not os.path.exists(SDK_DAEMON_READY_FILE):
|
||||||
|
if retry >= self.SX_OPEN_RETRIES:
|
||||||
|
raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
|
||||||
|
.format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
|
||||||
|
else:
|
||||||
|
log_info("SDK daemon not started yet, retry {} times".format(retry))
|
||||||
|
retry = retry + 1
|
||||||
|
time.sleep(self.SX_OPEN_TIMEOUT)
|
||||||
|
|
||||||
|
# to make sure SDK daemon has started
|
||||||
|
time.sleep(self.SX_OPEN_TIMEOUT)
|
||||||
|
|
||||||
|
# Once the SDK daemon has started, sx_api_open and sx_api_host_ifc_open are ready to be called
|
||||||
rc, self.handle = sx_api_open(None)
|
rc, self.handle = sx_api_open(None)
|
||||||
if rc == SX_STATUS_SUCCESS:
|
if rc != SX_STATUS_SUCCESS:
|
||||||
break
|
raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))
|
||||||
|
|
||||||
log_warning("failed to open SDK API handle... retrying {}".format(retry))
|
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
|
||||||
|
if rc != SX_STATUS_SUCCESS:
|
||||||
|
raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))
|
||||||
|
|
||||||
time.sleep(2 ** retry)
|
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
|
||||||
retry += 1
|
self.user_channel_p.channel.fd = self.rx_fd_p
|
||||||
|
|
||||||
if retry > self.SX_OPEN_RETRIES:
|
# Wait for switch to be created and initialized inside SDK
|
||||||
raise RuntimeError("failed to open SDK API handle after {} retries".format(retry))
|
retry = 0
|
||||||
|
swid_cnt_p = new_uint32_t_p()
|
||||||
|
uint32_t_p_assign(swid_cnt_p, 0)
|
||||||
|
swid_cnt = 0
|
||||||
|
while True:
|
||||||
|
if retry >= self.SX_OPEN_RETRIES:
|
||||||
|
raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
|
||||||
|
.format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
|
||||||
|
else:
|
||||||
|
rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
|
||||||
|
if rc == SX_STATUS_SUCCESS:
|
||||||
|
swid_cnt = uint32_t_p_value(swid_cnt_p)
|
||||||
|
if swid_cnt > 0:
|
||||||
|
delete_uint32_t_p(swid_cnt_p)
|
||||||
|
swid_cnt_p = None
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
|
||||||
|
.format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
|
||||||
|
else:
|
||||||
|
raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
|
||||||
|
format(rc, retry, self.SX_OPEN_TIMEOUT * retry))
|
||||||
|
|
||||||
rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
|
retry = retry + 1
|
||||||
if rc != SX_STATUS_SUCCESS:
|
time.sleep(self.SX_OPEN_TIMEOUT)
|
||||||
raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
|
|
||||||
|
|
||||||
self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
|
# After switch was created inside SDK, sx_api_host_ifc_trap_id_register_set is ready to call
|
||||||
self.user_channel_p.channel.fd = self.rx_fd_p
|
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
|
||||||
|
SX_ACCESS_CMD_REGISTER,
|
||||||
|
self.swid,
|
||||||
|
SX_TRAP_ID_PMPE,
|
||||||
|
self.user_channel_p)
|
||||||
|
|
||||||
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
|
if rc != SX_STATUS_SUCCESS:
|
||||||
SX_ACCESS_CMD_REGISTER,
|
raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))
|
||||||
self.swid,
|
|
||||||
SX_TRAP_ID_PMPE,
|
self.running = True
|
||||||
self.user_channel_p)
|
except Exception as e:
|
||||||
if rc != SX_STATUS_SUCCESS:
|
log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
|
||||||
raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c))
|
if swid_cnt_p is not None:
|
||||||
|
delete_uint32_t_p(swid_cnt_p)
|
||||||
|
self.deinitialize()
|
||||||
|
|
||||||
def deinitialize(self):
|
def deinitialize(self):
|
||||||
# remove mlnx-sfpd liveness key in DB if not expired yet
|
# remove mlnx-sfpd liveness key in DB if not expired yet
|
||||||
@ -156,7 +199,6 @@ class MlnxSfpd:
|
|||||||
log_error("sx_api_close exited with error, rc {}".format(rc))
|
log_error("sx_api_close exited with error, rc {}".format(rc))
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.running = True
|
|
||||||
|
|
||||||
while self.running:
|
while self.running:
|
||||||
try:
|
try:
|
||||||
|
Reference in New Issue
Block a user