[202111] [Mellanox] Add retry when SFP object is not initialized correctly due to eeprom read failure (#11265)

- Why I did it
During platform API SFP object initialization, there are two steps, one is to read the xSFP type from EEPROM, and another is to parse the xSFP DOM support capability. There is the possibility that the xSFP EEPROM is not ready when it started to read, which will result in the SFP object does not have type and DOM capability correctly initialized, which will cause further issues. So need to add a mechanism to retry in this case.

- How I did it
Add flags to indicate whether the SFP object has been correctly initialized or not, set the flag when an error happened and after all relevant bytes from EEPROM are correctly read out and parsed, clear the flag.
Leverage the Python decorator to decorate the related functions, each time when the related function is called the decorator will check whether the SFP object has been correctly initialized or not, if not it will read the EEPROM and parse again.

- How to verify it
Run SFP-related platform tests to make sure no new issue is introduced.

Signed-off-by: Kebo Liu <kebol@nvidia.com>
This commit is contained in:
Kebo Liu 2022-07-03 20:59:51 +08:00 committed by GitHub
parent 6ab339482f
commit 27839792d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 86 additions and 17 deletions

View File

@ -369,7 +369,8 @@ class SfpCapability:
self.calibration = 0
self.qsfp_page3_available = False
self.second_application_list = False
self.dom_detect_finished = False
class SFP(SfpBase):
"""Platform-specific SFP class"""
@ -417,11 +418,11 @@ class SFP(SfpBase):
logger.log_error('Failed to open SDK handle')
return SFP.shared_sdk_handle
@property
def sfp_type(self):
def _detect_sfp_type(self):
if not self._sfp_type:
eeprom_raw = []
eeprom_raw = self._read_eeprom_specific_bytes(XCVR_TYPE_OFFSET, XCVR_TYPE_WIDTH)
if eeprom_raw:
if eeprom_raw[0] in SFP_TYPE_CODE_LIST:
self._sfp_type = SFP_TYPE
@ -430,14 +431,13 @@ class SFP(SfpBase):
elif eeprom_raw[0] in QSFP_DD_TYPE_CODE_LIST:
self._sfp_type = QSFP_DD_TYPE
# we don't regonize this identifier value, treat the xSFP module as the default type
if not self._sfp_type:
raise RuntimeError("Failed to detect SFP type for SFP {}".format(self.index))
else:
return self._sfp_type
@property
@utils.pre_initialize(_detect_sfp_type)
def sfp_type(self):
return self._sfp_type
def _dom_capability_detect(self):
if self._sfp_capability:
if self._sfp_capability and self._sfp_capability.dom_detect_finished:
return
self._sfp_capability = SfpCapability()
@ -456,8 +456,9 @@ class SFP(SfpBase):
# need to add more code for determining the capability and version compliance
# in SFF-8636 dom capability definitions evolving with the versions.
qsfp_dom_capability_raw = self._read_eeprom_specific_bytes((offset + XCVR_DOM_CAPABILITY_OFFSET), XCVR_DOM_CAPABILITY_WIDTH)
if qsfp_dom_capability_raw is not None:
qsfp_version_compliance_raw = self._read_eeprom_specific_bytes(QSFP_VERSION_COMPLIANCE_OFFSET, QSFP_VERSION_COMPLIANCE_WIDTH)
qsfp_version_compliance_raw = self._read_eeprom_specific_bytes(QSFP_VERSION_COMPLIANCE_OFFSET, QSFP_VERSION_COMPLIANCE_WIDTH)
qsfp_option_value_raw = self._read_eeprom_specific_bytes(QSFP_OPTION_VALUE_OFFSET, QSFP_OPTION_VALUE_WIDTH)
if None not in (qsfp_dom_capability_raw, qsfp_version_compliance_raw, qsfp_option_value_raw):
qsfp_version_compliance = int(qsfp_version_compliance_raw[0], 16)
dom_capability = sfpi_obj.parse_dom_capability(qsfp_dom_capability_raw, 0)
if qsfp_version_compliance >= 0x08:
@ -475,13 +476,14 @@ class SFP(SfpBase):
sfpd_obj = sff8436Dom()
if sfpd_obj is None:
return None
qsfp_option_value_raw = self._read_eeprom_specific_bytes(QSFP_OPTION_VALUE_OFFSET, QSFP_OPTION_VALUE_WIDTH)
if qsfp_option_value_raw is not None:
optional_capability = sfpd_obj.parse_option_params(qsfp_option_value_raw, 0)
self._sfp_capability.dom_tx_disable_supported = optional_capability['data']['TxDisable']['value'] == 'On'
optional_capability = sfpd_obj.parse_option_params(qsfp_option_value_raw, 0)
self._sfp_capability.dom_tx_disable_supported = optional_capability['data']['TxDisable']['value'] == 'On'
dom_status_indicator = sfpd_obj.parse_dom_status_indicator(qsfp_version_compliance_raw, 1)
self._sfp_capability.qsfp_page3_available = dom_status_indicator['data']['FlatMem']['value'] == 'Off'
self._sfp_capability.dom_detect_finished = True
else:
logger.log_warning("SFP {}: Dom capabilty parsing is failed due to eeprom read fail, will re-try next time.".format(self.index))
self._sfp_capability.dom_supported = False
self._sfp_capability.dom_temp_supported = False
self._sfp_capability.dom_volt_supported = False
@ -489,6 +491,7 @@ class SFP(SfpBase):
self._sfp_capability.dom_tx_power_supported = False
self._sfp_capability.calibration = 0
self._sfp_capability.qsfp_page3_available = False
self._sfp_capability.dom_detect_finished = False
elif self.sfp_type == QSFP_DD_TYPE:
sfpi_obj = qsfp_dd_InterfaceId()
@ -518,6 +521,7 @@ class SFP(SfpBase):
self._sfp_capability.dom_tx_bias_power_supported = False
self._sfp_capability.dom_thresholds_supported = False
self._sfp_capability.dom_rx_tx_power_bias_supported = False
self._sfp_capability.dom_detect_finished = True
else:
self._sfp_capability.dom_supported = False
self._sfp_capability.dom_temp_supported = False
@ -527,6 +531,8 @@ class SFP(SfpBase):
self._sfp_capability.dom_tx_bias_power_supported = False
self._sfp_capability.dom_thresholds_supported = False
self._sfp_capability.dom_rx_tx_power_bias_supported = False
self._sfp_capability.dom_detect_finished = False
logger.log_warning("SFP {}: Dom capabilty parsing is failed due to eeprom read fail, will re-try next time.".format(self.index))
elif self.sfp_type == SFP_TYPE:
sfpi_obj = sff8472InterfaceId()
@ -554,12 +560,15 @@ class SFP(SfpBase):
self._sfp_capability.dom_tx_power_supported = False
self._sfp_capability.calibration = 0
self._sfp_capability.dom_tx_disable_supported = (int(sfp_dom_capability_raw[1], 16) & 0x40 != 0)
self._sfp_capability.dom_detect_finished = True
else:
self._sfp_capability.dom_supported = False
self._sfp_capability.dom_temp_supported = False
self._sfp_capability.dom_volt_supported = False
self._sfp_capability.dom_rx_power_supported = False
self._sfp_capability.dom_tx_power_supported = False
self._sfp_capability.dom_detect_finished = False
logger.log_warning("SFP {}: Dom capabilty parsing is failed due to sfp type is not one of the supported ones, will re-try next time.".format(self.index))
@property
@utils.pre_initialize(_dom_capability_detect)
@ -907,7 +916,7 @@ class SFP(SfpBase):
transceiver_info_dict['nominal_bit_rate'] = "Not supported for CMIS cables"
transceiver_info_dict['application_advertisement'] = host_media_list
else:
elif self.sfp_type == SFP_TYPE:
offset = 0
vendor_rev_width = XCVR_HW_REV_WIDTH_SFP
interface_info_bulk_width = XCVR_INTFACE_BULK_WIDTH_SFP
@ -916,6 +925,10 @@ class SFP(SfpBase):
if sfpi_obj is None:
print("Error: sfp_object open failed")
return None
else:
# None of any supported SFP type, could be SFP object not correctly initialized.
logger.log_warning("SFP {}: type is not one the supported type, or SFP object initialization is not finished yet.".format(self.index))
return None
if self.sfp_type != QSFP_DD_TYPE:
sfp_interface_bulk_raw = self._read_eeprom_specific_bytes(offset + XCVR_INTERFACE_DATA_START, XCVR_INTERFACE_DATA_SIZE)

View File

@ -25,9 +25,10 @@ test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
from sonic_platform.sfp import SFP, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
from sonic_platform.sfp import QSFP_TYPE, QSFP_DD_TYPE, SFP_TYPE, SFP, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
from sonic_platform.chassis import Chassis
class TestSfp:
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_linecard_count', mock.MagicMock(return_value=8))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_linecard_max_port_count')
@ -80,3 +81,58 @@ class TestSfp:
description = sfp.get_error_description()
assert description == expected_description
def test_detect_sfp_type(self):
sfp = SFP(0)
assert sfp._sfp_type == None
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = ['03'])
assert sfp.sfp_type == SFP_TYPE
sfp._sfp_type = None
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = ['0d'])
assert sfp.sfp_type == QSFP_TYPE
sfp._sfp_type = None
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = ['18'])
assert sfp.sfp_type == QSFP_DD_TYPE
def test_detect_dom_capability(self):
sfp = SFP(0)
sfp.get_presence = mock.MagicMock(return_value = True)
# QSFP postive flow
sfp._sfp_type = QSFP_TYPE
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = ['19', '08', 'ff', '04'])
assert sfp.dom_supported == True
assert sfp._sfp_capability.dom_detect_finished == True
# QSFP negative flow
sfp._sfp_capability.dom_detect_finished = False
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = None)
assert sfp.dom_supported == False
assert sfp._sfp_capability.dom_detect_finished == False
# SFP postive flow
sfp._sfp_type = SFP_TYPE
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = ['ff', '00'])
assert sfp.dom_supported == True
assert sfp._sfp_capability.dom_detect_finished == True
# SFP negative flow
sfp._sfp_capability.dom_detect_finished = False
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = None)
assert sfp.dom_supported == False
assert sfp._sfp_capability.dom_detect_finished == False
# QSFPDD postive flow
sfp._sfp_type = QSFP_DD_TYPE
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = ['00'])
assert sfp.dom_supported == True
assert sfp._sfp_capability.dom_detect_finished == True
# QSFPDD negative flow
sfp._sfp_capability.dom_detect_finished = False
sfp._read_eeprom_specific_bytes = mock.MagicMock(return_value = None)
assert sfp.dom_supported == False
assert sfp._sfp_capability.dom_detect_finished == False