[202012] [Mellanox] Fix issue: SFP eeprom corrupted after replacing cable with different sfp type (#13543)

- Why I did it
There are 3 tasks in xcvrd:

main task, run a loop to recover missing SFP static information to DB every 1 minute
SFP state task, a process which listens cable plug in/out event, insert SFP static information to DB while a cable is inserted
SFP DOM update task, a thread which handles cable DOM information update every 1 minute
Let assume user replaces QSFP with QSFP-DD. There are two issues:

Only SFP state task listens cable plug in/out event, main task and SFP DOM update task does not know SFP type has changed, they still “think” the SFP type is QSFP. So, main task and SFP DOM update task uses QSFP standard to parse QSFP-DD EEPROM which causes corrupted data.
There is a race condition between main task and SFP state task. They both insert SFP static information to DB. Depends on timing, it is possible that main task using wrong SFP type to override SFP static information.
The PR is to fix these two issues.

There is no such issue on 202205 and above because there is a refactor for xcvrd:

SFP state task was changed from process to thread, so that all 3 tasks share the same memory space, they always have correct SFP type.
Recover missing SFP information logical was moved from main task to SFP state task. There is no race condition anymore.

- How I did it
It is difficult to back port latest xcvrd because there are many refactor/new features in xcvrd after 202012 release. It will be huge effort to do so. Based on that, we decided to fix the issue on Nvidia platform API side. The fix is that: refreshing SFP type before any SFP API which accessing SFP EEPROM. Refreshing SFP type before any SFP API would cause a small performance down: Due to my test on 202012 branch, accessing transceiver INFO and DOM INFO for 32 ports takes 1.7 seconds before the change. The number changes to 2.4 seconds after the change. I suppose the performance down is acceptable.

- How to verify it
Manual test
Regression
This commit is contained in:
Junchao-Mellanox 2023-02-19 15:47:32 +08:00 committed by GitHub
parent 5b889543ee
commit 7543993af3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -7,6 +7,7 @@
#############################################################################
try:
import functools
import time
from sonic_platform_base.sfp_base import SfpBase
from sonic_platform_base.sonic_eeprom import eeprom_dts
@ -323,6 +324,24 @@ def deinitialize_sdk_handle(sdk_handle):
logger.log_warning("Sdk handle is none")
return False
def refresh_sfp_type(refresh_dom_capability=True):
"""Decorator to force refreshing sfp type
Args:
refresh_dom_capability (bool, optional): refresh DOM capability. Defaults to True.
"""
def decorator(method):
@functools.wraps(method)
def _impl(self, *args, **kwargs):
self._refresh_sfp_type()
if refresh_dom_capability:
self._dom_capability_detect()
return method(self, *args, **kwargs)
return _impl
return decorator
class SFP(SfpBase):
"""Platform-specific SFP class"""
@ -332,9 +351,9 @@ class SFP(SfpBase):
self.sdk_sysfs_page_path_header = SFP_SYSFS_PATH.format(sfp_index)
self.sfp_eeprom_path = "qsfp{}".format(self.index)
self.sfp_status_path = "qsfp{}_status".format(self.index)
self._detect_sfp_type(sfp_type)
self._sfp_type = None
self._default_sfp_type = sfp_type
self.dom_tx_disable_supported = False
self._dom_capability_detect()
self.sdk_handle_getter = sdk_handle_getter
self.sdk_index = sfp_index
@ -352,7 +371,7 @@ class SFP(SfpBase):
Re-initialize this SFP object when a new SFP inserted
:return:
"""
self._detect_sfp_type(self.sfp_type)
self._refresh_sfp_type()
self._dom_capability_detect()
def get_presence(self):
@ -382,7 +401,8 @@ class SFP(SfpBase):
if page_num == 0:
page = self.sdk_sysfs_page_path_header + PATH_PAGE00
else:
if self.sfp_type == SFP_TYPE and page_num == 1:
self._refresh_sfp_type()
if page_num == 1 and self.sfp_type == SFP_TYPE:
page = self.sdk_sysfs_page_path_header + PATH_PAGE00_A2
else:
page = self.sdk_sysfs_page_path_header + PATH_PAGE.format(page_num)
@ -394,26 +414,33 @@ class SFP(SfpBase):
except (OSError, IOError):
return None
def _detect_sfp_type(self, sfp_type):
eeprom_raw = []
eeprom_raw = self._read_eeprom_specific_bytes(XCVR_TYPE_OFFSET, XCVR_TYPE_WIDTH)
if eeprom_raw:
if eeprom_raw[0] in SFP_TYPE_CODE_LIST:
self.sfp_type = SFP_TYPE
elif eeprom_raw[0] in QSFP_TYPE_CODE_LIST:
self.sfp_type = QSFP_TYPE
elif eeprom_raw[0] in QSFP_DD_TYPE_CODE_LIST:
self.sfp_type = QSFP_DD_TYPE
@property
def sfp_type(self):
if self._sfp_type is None:
eeprom_raw = []
eeprom_raw = self._read_eeprom_specific_bytes(XCVR_TYPE_OFFSET, XCVR_TYPE_WIDTH)
if eeprom_raw:
if eeprom_raw[0] in SFP_TYPE_CODE_LIST:
self._sfp_type = SFP_TYPE
elif eeprom_raw[0] in QSFP_TYPE_CODE_LIST:
self._sfp_type = QSFP_TYPE
elif eeprom_raw[0] in QSFP_DD_TYPE_CODE_LIST:
self._sfp_type = QSFP_DD_TYPE
else:
# we don't recognize this identifier value, treat the xSFP module as the default type
self._sfp_type = self._default_sfp_type
logger.log_info("Identifier value of {} module {} is {} which isn't recognized and will be treated as default type ({})".format(
self._default_sfp_type, self.index, eeprom_raw[0], self._default_sfp_type
))
else:
# we don't regonize this identifier value, treat the xSFP module as the default type
self.sfp_type = sfp_type
logger.log_info("Identifier value of {} module {} is {} which isn't regonized and will be treated as default type ({})".format(
sfp_type, self.index, eeprom_raw[0], sfp_type
))
else:
# eeprom_raw being None indicates the module is not present.
# in this case we treat it as the default type according to the SKU
self.sfp_type = sfp_type
# eeprom_raw being None indicates the module is not present.
# in this case we treat it as the default type according to the SKU
self._sfp_type = self._default_sfp_type
self._default_sfp_type = self._sfp_type
return self._sfp_type
def _refresh_sfp_type(self):
self._sfp_type = None
def _is_qsfp_copper(self):
# This function read the specification compliance byte and
@ -537,6 +564,7 @@ class SFP(SfpBase):
self.dom_supported = False
self.dom_temp_supported = False
self.dom_volt_supported = False
self.second_application_list = False
self.dom_rx_power_supported = False
self.dom_tx_power_supported = False
self.dom_tx_bias_power_supported = False
@ -595,7 +623,7 @@ class SFP(SfpBase):
else:
return 'N/A'
@refresh_sfp_type()
def get_transceiver_info(self):
"""
Retrieves transceiver info of this SFP
@ -900,7 +928,7 @@ class SFP(SfpBase):
return transceiver_info_dict
@refresh_sfp_type()
def get_transceiver_bulk_status(self):
"""
Retrieves transceiver bulk status of this SFP
@ -1100,7 +1128,7 @@ class SFP(SfpBase):
return transceiver_dom_info_dict
@refresh_sfp_type()
def get_transceiver_threshold_info(self):
"""
Retrieves transceiver threshold info of this SFP
@ -1279,7 +1307,7 @@ class SFP(SfpBase):
return transceiver_dom_threshold_info_dict
@refresh_sfp_type()
def get_reset_status(self):
"""
Retrieves the reset status of SFP
@ -1332,6 +1360,7 @@ class SFP(SfpBase):
dom_channel_status_data = sfpd_obj.parse_dom_channel_status(dom_channel_status_raw, 0)
return dom_channel_status_data['data']['Status']['value'] == 'On'
@refresh_sfp_type()
def get_rx_los(self):
"""
Retrieves the RX LOS (lost-of-signal) status of SFP
@ -1364,7 +1393,7 @@ class SFP(SfpBase):
rx_los_list.append(False)
return rx_los_list
@refresh_sfp_type()
def get_tx_fault(self):
"""
Retrieves the TX fault status of SFP
@ -1397,7 +1426,7 @@ class SFP(SfpBase):
tx_fault_list.append(False)
return tx_fault_list
@refresh_sfp_type()
def get_tx_disable(self):
"""
Retrieves the tx_disable status of this SFP
@ -1500,7 +1529,7 @@ class SFP(SfpBase):
return oper_pwr_mode == SX_MGMT_PHY_MOD_PWR_MODE_LOW_E
@refresh_sfp_type(refresh_dom_capability=True)
def get_power_override(self):
"""
Retrieves the power-override status of this SFP
@ -1521,7 +1550,7 @@ class SFP(SfpBase):
else:
return NotImplementedError
@refresh_sfp_type()
def get_temperature(self):
"""
Retrieves the temperature of this SFP
@ -1579,7 +1608,7 @@ class SFP(SfpBase):
else:
return None
@refresh_sfp_type()
def get_voltage(self):
"""
Retrieves the supply voltage of this SFP
@ -1638,7 +1667,7 @@ class SFP(SfpBase):
else:
return None
@refresh_sfp_type()
def get_tx_bias(self):
"""
Retrieves the TX bias current of this SFP
@ -1672,7 +1701,7 @@ class SFP(SfpBase):
if sfpd_obj is None:
return None
if dom_tx_bias_power_supported:
if self.dom_tx_bias_power_supported:
dom_tx_bias_raw = self._read_eeprom_specific_bytes((offset + QSFP_DD_TX_BIAS_OFFSET), QSFP_DD_TX_BIAS_WIDTH)
if dom_tx_bias_raw is not None:
dom_tx_bias_data = sfpd_obj.parse_dom_tx_bias(dom_tx_bias_raw, 0)
@ -1705,7 +1734,7 @@ class SFP(SfpBase):
return tx_bias_list
@refresh_sfp_type()
def get_rx_power(self):
"""
Retrieves the received optical power for this SFP
@ -1781,7 +1810,7 @@ class SFP(SfpBase):
return None
return rx_power_list
@refresh_sfp_type()
def get_tx_power(self):
"""
Retrieves the TX power of this SFP