[Mellanox] Add bitmap support for SFP error event (#7605)

#### Why I did it

Currently, SONiC uses a single value to represent an SFP error; however, multiple SFP errors can exist at the same time. This PR adds support for reporting them together.

#### How I did it

Return a bitmap instead of a single value when an SFP event occurs.

Signed-off-by: Stephen Sun <stephens@nvidia.com>
This commit is contained in:
Junchao-Mellanox 2021-06-26 01:56:47 +08:00 committed by GitHub
parent 0a92ce18cf
commit 147bf240f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 244 additions and 32 deletions

View File

@ -569,18 +569,22 @@ class Chassis(ChassisBase):
wait_for_ever = (timeout == 0)
port_dict = {}
error_dict = {}
if wait_for_ever:
timeout = MAX_SELECT_DELAY
while True:
status = self.sfp_event.check_sfp_status(port_dict, timeout)
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
if bool(port_dict):
break
else:
status = self.sfp_event.check_sfp_status(port_dict, timeout)
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
if status:
self.reinit_sfps(port_dict)
return True, {'sfp':port_dict}
result_dict = {'sfp':port_dict}
if error_dict:
result_dict['sfp_error'] = error_dict
return True, result_dict
else:
return True, {'sfp':{}}

View File

@ -8,6 +8,7 @@
try:
import subprocess
import os
from sonic_platform_base.sfp_base import SfpBase
from sonic_platform_base.sonic_eeprom import eeprom_dts
from sonic_platform_base.sonic_sfp.sff8472 import sff8472InterfaceId
@ -33,6 +34,18 @@ try:
except ImportError as e:
pass
# The SDK constants cannot be imported under unit test, so local stand-ins
# are defined whenever PLATFORM_API_UNIT_TESTING is set to "1".
# A missing variable simply leaves the names undefined here.
if os.environ.get("PLATFORM_API_UNIT_TESTING") == "1":
    SX_PORT_MODULE_STATUS_INITIALIZING = 0
    SX_PORT_MODULE_STATUS_PLUGGED = 1
    SX_PORT_MODULE_STATUS_UNPLUGGED = 2
    SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR = 3
    SX_PORT_MODULE_STATUS_PLUGGED_DISABLED = 4
# definitions of the offset and width for values in XCVR info eeprom
XCVR_INTFACE_BULK_OFFSET = 0
XCVR_INTFACE_BULK_WIDTH_QSFP = 20
@ -328,6 +341,18 @@ class SdkHandleContext(object):
class SFP(SfpBase):
"""Platform-specific SFP class"""
SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE = 'Long range for non-Mellanox cable or module'
SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST = 'Enforce part number list'
SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED = 'PMD type not enabled'
SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED = 'PCIE system power slot exceeded'
SFP_MLNX_ERROR_DESCRIPTION_RESERVED = 'Reserved'
SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE = 0x00010000
SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST = 0x00020000
SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED = 0x00040000
SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED = 0x00080000
SFP_MLNX_ERROR_BIT_RESERVED = 0x80000000
def __init__(self, sfp_index, sfp_type, sdk_handle_getter, platform):
SfpBase.__init__(self)
self.index = sfp_index + 1
@ -386,7 +411,7 @@ class SFP(SfpBase):
# Read out any bytes from any offset
def _read_eeprom_specific_bytes(self, offset, num_bytes):
eeprom_raw = []
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {}".format(self.index, offset, num_bytes)
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {} 2>/dev/null".format(self.index, offset, num_bytes)
try:
output = subprocess.check_output(ethtool_cmd,
shell=True,
@ -2165,3 +2190,68 @@ class SFP(SfpBase):
bool: True if it is replaceable.
"""
return True
def _get_error_code(self):
    """
    Query the SDK for the state of this SFP module

    Returns:
        A tuple (oper_state, error_type) read from the module_state that
        sx_mgmt_phy_module_info_get reports for self.sdk_index
    """
    # The SDK call takes arrays, so build one-element request/response arrays
    id_array = new_sx_mgmt_module_id_info_t_arr(1)
    info_array = new_sx_mgmt_phy_module_info_t_arr(1)

    request = sx_mgmt_module_id_info_t()
    request.slot_id = 0
    request.module_id = self.sdk_index
    sx_mgmt_module_id_info_t_arr_setitem(id_array, 0, request)

    rc = sx_mgmt_phy_module_info_get(self.sdk_handle, id_array, 1, info_array)
    assert SX_STATUS_SUCCESS == rc, "sx_mgmt_phy_module_info_get failed, error code {}".format(rc)

    state = sx_mgmt_phy_module_info_t_arr_getitem(info_array, 0).module_state
    return state.oper_state, state.error_type
@classmethod
def _get_error_description_dict(cls):
    """
    Build the mapping from error type to error description

    Returns:
        A new dict keyed by integer error type whose values are the
        SFP_ERROR_DESCRIPTION_* / SFP_MLNX_ERROR_DESCRIPTION_* strings
        available on the class
    """
    code_description_pairs = (
        (0, cls.SFP_ERROR_DESCRIPTION_POWER_BUDGET_EXCEEDED),
        (1, cls.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE),
        (2, cls.SFP_ERROR_DESCRIPTION_I2C_STUCK),
        (3, cls.SFP_ERROR_DESCRIPTION_BAD_EEPROM),
        (4, cls.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST),
        (5, cls.SFP_ERROR_DESCRIPTION_UNSUPPORTED_CABLE),
        (6, cls.SFP_ERROR_DESCRIPTION_HIGH_TEMP),
        (7, cls.SFP_ERROR_DESCRIPTION_BAD_CABLE),
        (8, cls.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED),
        (12, cls.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED),
        (255, cls.SFP_MLNX_ERROR_DESCRIPTION_RESERVED),
    )
    return dict(code_description_pairs)
def get_error_description(self):
    """
    Get a human-readable description of the module's current status

    The operational state and error type are obtained from
    self._get_error_code(); this method takes no arguments.

    Returns:
        The status string for the operational state. When the module is
        plugged with an error, the description of that error type, or
        "Unknown error (<code>)" if the error type is not in the
        description dictionary. For an unrecognized operational state,
        "Unknown SFP module status (<state>)".
    """
    oper_status, error_code = self._get_error_code()

    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR:
        # Only this state carries a meaningful error type
        unknown_error = "Unknown error ({})".format(error_code)
        return self._get_error_description_dict().get(error_code, unknown_error)

    status_descriptions = {
        SX_PORT_MODULE_STATUS_INITIALIZING: self.SFP_STATUS_INITIALIZING,
        SX_PORT_MODULE_STATUS_PLUGGED: self.SFP_STATUS_OK,
        SX_PORT_MODULE_STATUS_UNPLUGGED: self.SFP_STATUS_UNPLUGGED,
        SX_PORT_MODULE_STATUS_PLUGGED_DISABLED: self.SFP_STATUS_DISABLED,
    }
    unknown_status = "Unknown SFP module status ({})".format(oper_status)
    return status_descriptions.get(oper_status, unknown_status)

View File

@ -7,8 +7,16 @@ import sys, errno
import os
import time
import select
from python_sdk_api.sx_api import *
if 'MLNX_PLATFORM_API_UNIT_TESTING' not in os.environ:
from python_sdk_api.sx_api import *
else:
from mock import MagicMock
class MockSxFd(object):
fd = 99
new_sx_fd_t_p = MagicMock(return_value=MockSxFd())
new_sx_user_channel_t_p = MagicMock()
from sonic_py_common.logger import Logger
from .sfp import SFP
# SFP status from PMAOS register
# 0x1 plug in
@ -22,15 +30,6 @@ SDK_SFP_STATE_OUT = 0x2
SDK_SFP_STATE_ERR = 0x3
SDK_SFP_STATE_DIS = 0x4
# SFP status that will be handled by XCVRD
STATUS_PLUGIN = '1'
STATUS_PLUGOUT = '0'
STATUS_ERR_I2C_STUCK = '2'
STATUS_ERR_BAD_EEPROM = '3'
STATUS_ERR_UNSUPPORTED_CABLE = '4'
STATUS_ERR_HIGH_TEMP = '5'
STATUS_ERR_BAD_CABLE = '6'
# SFP status used in this file only, will not expose to XCVRD
# STATUS_ERROR will be mapped to different status according to the error code
STATUS_UNKNOWN = '-1'
@ -60,19 +59,39 @@ STATUS_ERROR = '-2'
'''
# SFP errors that will block eeprom accessing
sdk_sfp_err_type_dict = {
0x2: STATUS_ERR_I2C_STUCK,
0x3: STATUS_ERR_BAD_EEPROM,
0x5: STATUS_ERR_UNSUPPORTED_CABLE,
0x6: STATUS_ERR_HIGH_TEMP,
0x7: STATUS_ERR_BAD_CABLE
SDK_SFP_BLOCKING_ERRORS = [
0x2, # SFP.SFP_ERROR_BIT_I2C_STUCK,
0x3, # SFP.SFP_ERROR_BIT_BAD_EEPROM,
0x5, # SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
0x6, # SFP.SFP_ERROR_BIT_HIGH_TEMP,
0x7, # SFP.SFP_ERROR_BIT_BAD_CABLE
]
SDK_ERRORS_TO_ERROR_BITS = {
0x0: SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED,
0x1: SFP.SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE,
0x2: SFP.SFP_ERROR_BIT_I2C_STUCK,
0x3: SFP.SFP_ERROR_BIT_BAD_EEPROM,
0x4: SFP.SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST,
0x5: SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
0x6: SFP.SFP_ERROR_BIT_HIGH_TEMP,
0x7: SFP.SFP_ERROR_BIT_BAD_CABLE,
0x8: SFP.SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED,
0xc: SFP.SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED
}
SDK_ERRORS_TO_DESCRIPTION = {
0x1: SFP.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE,
0x4: SFP.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST,
0x8: SFP.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED,
0xc: SFP.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED
}
sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
SDK_SFP_STATE_IN: str(SFP.SFP_STATUS_BIT_INSERTED),
SDK_SFP_STATE_OUT: str(SFP.SFP_STATUS_BIT_REMOVED),
SDK_SFP_STATE_ERR: STATUS_ERROR,
SDK_SFP_STATE_DIS: STATUS_PLUGOUT,
SDK_SFP_STATE_DIS: str(SFP.SFP_STATUS_BIT_REMOVED),
}
# system level event/error
@ -195,7 +214,7 @@ class sfp_event:
delete_sx_fd_t_p(self.rx_fd_p)
delete_sx_user_channel_t_p(self.user_channel_p)
def check_sfp_status(self, port_change, timeout):
def check_sfp_status(self, port_change, error_dict, timeout):
"""
the meaning of timeout is aligned with select.select, which has the following meaning:
0: poll, returns without blocked
@ -233,6 +252,7 @@ class sfp_event:
break
sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN)
error_description = None
if sfp_state == STATUS_UNKNOWN:
# in the following sequence, STATUS_UNKNOWN can be returned.
# so we shouldn't raise exception here.
@ -247,18 +267,29 @@ class sfp_event:
# If get SFP status error(0x3) from SDK, then need to read the error_type to get the detailed error
if sfp_state == STATUS_ERROR:
if error_type in sdk_sfp_err_type_dict.keys():
# In SFP at error status case, need to overwrite the sfp_state with the exact error code
sfp_state = sdk_sfp_err_type_dict[error_type]
else:
# For errors don't block the eeprom accessing, we don't report it to XCVRD
logger.log_info("SFP error on port but not blocking eeprom read, error_type {}".format(error_type))
found +=1
sfp_state_bits = SDK_ERRORS_TO_ERROR_BITS.get(error_type)
if sfp_state_bits is None:
logger.log_error("Unrecognized error {} detected on ports {}".format(error_type, port_list))
found += 1
continue
if error_type in SDK_SFP_BLOCKING_ERRORS:
# In SFP at error status case, need to overwrite the sfp_state with the exact error code
sfp_state_bits |= SFP.SFP_ERROR_BIT_BLOCKING
# An error should be always set along with 'INSERTED'
sfp_state_bits |= SFP.SFP_STATUS_BIT_INSERTED
# For vendor specific errors, the description should be returned as well
error_description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
sfp_state = str(sfp_state_bits)
for port in port_list:
logger.log_info("SFP on port {} state {}".format(port, sfp_state))
port_change[port+1] = sfp_state
if error_description:
error_dict[port+1] = error_description
found += 1
return found != 0

View File

@ -8,8 +8,11 @@ test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
os.environ["PLATFORM_API_UNIT_TESTING"] = "1"
from sonic_py_common import device_info
from sonic_platform.sfp import SFP
from sonic_platform.sfp import SFP, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
from sonic_platform.chassis import Chassis
@ -26,8 +29,14 @@ def mock_get_sdk_handle(self):
self.sdk_handle = 1
return self.sdk_handle
def mock_get_sfp_error_code(self):
    """Return the (oper_code, error_code) pair previously stored on the object by the test"""
    oper_code, error_code = self.oper_code, self.error_code
    return oper_code, error_code
device_info.get_platform = mock_get_platform
SFP._read_eeprom_specific_bytes = mock_read_eeprom_specific_bytes
SFP._get_error_code = mock_get_sfp_error_code
Chassis.get_sdk_handle = mock_get_sdk_handle
@ -82,3 +91,35 @@ def test_sfp_full_initialize_without_partial():
# Verify when get_sfp is called, the SFP modules won't be initialized again
sfp1 = allsfp[0]
assert sfp1 == chassis.get_sfp(1)
def test_sfp_get_error_status():
    """Check get_error_description for known errors, an unknown error and the non-error states"""
    # Fetch an SFP module to test
    sfp = Chassis().get_sfp(1)

    # Plugged with error: every known error type maps to its own description
    sfp.oper_code = SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR
    for error_code, expected in sfp._get_error_description_dict().items():
        sfp.error_code = error_code
        assert sfp.get_error_description() == expected

    # An error type missing from the dictionary is reported as unknown
    sfp.error_code = -1
    assert sfp.get_error_description() == "Unknown error (-1)"

    # Remaining operational states map to fixed status strings
    state_expectations = (
        (SX_PORT_MODULE_STATUS_INITIALIZING, "Initializing"),
        (SX_PORT_MODULE_STATUS_PLUGGED, "OK"),
        (SX_PORT_MODULE_STATUS_UNPLUGGED, "Unplugged"),
        (SX_PORT_MODULE_STATUS_PLUGGED_DISABLED, "Disabled"),
    )
    for state, expected in state_expectations:
        sfp.oper_code = state
        assert sfp.get_error_description() == expected

View File

@ -0,0 +1,46 @@
import os
import select
import sys
from mock import MagicMock
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
from sonic_platform_base.sfp_base import SfpBase
class TestSfpEvent(object):
    """Unit tests for sfp_event.check_sfp_status with select.select and on_pmpe mocked"""

    @classmethod
    def setup_class(cls):
        """Set the unit-test environment switch and patch select.select for this class"""
        os.environ["MLNX_PLATFORM_API_UNIT_TESTING"] = "1"
        # Keep the real function so teardown_class can put it back
        cls._real_select = select.select
        # Every select call reports fd 99 as readable
        select.select = MagicMock(return_value=([99], None, None))

    @classmethod
    def teardown_class(cls):
        """Restore select.select so the mock does not leak into other test modules"""
        select.select = cls._real_select

    def test_check_sfp_status(self):
        """Plug in, plug out and every mapped SDK error must yield the expected status bits"""
        from sonic_platform.sfp_event import SDK_SFP_STATE_IN, SDK_SFP_STATE_OUT, SDK_SFP_STATE_ERR
        from sonic_platform.sfp_event import SDK_ERRORS_TO_ERROR_BITS, SDK_ERRORS_TO_DESCRIPTION, SDK_SFP_BLOCKING_ERRORS

        self.executor(SDK_SFP_STATE_IN, None, SfpBase.SFP_STATUS_BIT_INSERTED)
        self.executor(SDK_SFP_STATE_OUT, None, SfpBase.SFP_STATUS_BIT_REMOVED)
        for error_type, error_status in SDK_ERRORS_TO_ERROR_BITS.items():
            description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
            if error_type in SDK_SFP_BLOCKING_ERRORS:
                error_status |= SfpBase.SFP_ERROR_BIT_BLOCKING
            # The expected status always carries the INSERTED bit alongside an error
            error_status |= SfpBase.SFP_STATUS_BIT_INSERTED
            self.executor(SDK_SFP_STATE_ERR, error_type, error_status, description)

    def executor(self, mock_module_state, mock_error_type, expect_status, description=None):
        """
        Run check_sfp_status once with on_pmpe mocked and verify its output

        Args:
            mock_module_state: module state the mocked on_pmpe returns
            mock_error_type: error type the mocked on_pmpe returns
            expect_status: status bits expected for each port (compared as str)
            description: expected error_dict value for each port, if any
        """
        from sonic_platform.sfp_event import sfp_event

        event = sfp_event()
        # The mocked on_pmpe reports ports 0 and 1, which are checked below as keys 1 and 2
        event.on_pmpe = MagicMock(return_value=(True, [0,1], mock_module_state, mock_error_type))
        port_change = {}
        error_dict = {}
        found = event.check_sfp_status(port_change, error_dict, 0)
        assert found
        expect_status_str = str(expect_status)
        assert 1 in port_change and port_change[1] == expect_status_str
        assert 2 in port_change and port_change[2] == expect_status_str
        if description:
            assert 1 in error_dict and error_dict[1] == description
            assert 2 in error_dict and error_dict[2] == description