[Mellanox] Fix issues found for CMIS host management (#17637)

- Why I did it
1. The thermal updater should wait longer for modules to be initialized
2. SFP should get temperature thresholds from the module EEPROM because the SDK sysfs is not yet supported
3. Rename SFP functions to fix a typo ("threashold" -> "threshold")
4. sfp.get_presence should return False while the module is under initialization

- How I did it
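1. Make the thermal updater wait longer for modules to be initialized: raise max_wait_time in ThermalUpdater.wait_all_sfp_ready from 60 to 300
2. Make SFP get temperature thresholds from EEPROM: get_temperature_warning_threshold and get_temperature_critical_threshold no longer read the SDK sysfs and instead use _get_temperature_threshold, which now returns a (support, thresh) tuple
3. Rename the "*_threashold" functions in sfp and thermal updater to "*_threshold" and update their callers and unit tests
4. Make sfp.get_presence call is_sw_control() and return False if it raises, i.e. the module is not present or under initialization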

- How to verify it
Manual test
Unit test
This commit is contained in:
Junchao-Mellanox 2024-01-04 15:42:33 +08:00 committed by mssonicbld
parent 3f29b28b36
commit 8d65e2c517
7 changed files with 109 additions and 80 deletions

View File

@ -327,17 +327,10 @@ class SFP(NvidiaSFPCommon):
Returns: Returns:
bool: True if device is present, False if not bool: True if device is present, False if not
""" """
if DeviceDataManager.is_independent_mode(): try:
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') != 0: self.is_sw_control()
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'): except:
return False return False
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good'):
return False
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on'):
return False
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1:
return False
eeprom_raw = self._read_eeprom(0, 1, log_on_error=False) eeprom_raw = self._read_eeprom(0, 1, log_on_error=False)
return eeprom_raw is not None return eeprom_raw is not None
@ -877,6 +870,13 @@ class SFP(NvidiaSFPCommon):
return [False] * api.NUM_CHANNELS if api else None return [False] * api.NUM_CHANNELS if api else None
def get_temperature(self): def get_temperature(self):
"""Get SFP temperature
Returns:
None if there is an error (sysfs does not exist or sysfs return None or module EEPROM not readable)
0.0 if module temperature is not supported or module is under initialization
other float value if module temperature is available
"""
try: try:
if not self.is_sw_control(): if not self.is_sw_control():
temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input' temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
@ -893,59 +893,68 @@ class SFP(NvidiaSFPCommon):
temperature = super().get_temperature() temperature = super().get_temperature()
return temperature if temperature is not None else None return temperature if temperature is not None else None
def get_temperature_warning_threashold(self): def get_temperature_warning_threshold(self):
"""Get temperature warning threshold """Get temperature warning threshold
Returns: Returns:
int: temperature warning threshold None if there is an error (module EEPROM not readable)
0.0 if warning threshold is not supported or module is under initialization
other float value if warning threshold is available
""" """
try: try:
if not self.is_sw_control(): self.is_sw_control()
emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
log_func=None,
default=None)
return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
except: except:
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD return 0.0
thresh = self._get_temperature_threshold() support, thresh = self._get_temperature_threshold()
if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh: if support is None or thresh is None:
return thresh[consts.TEMP_HIGH_WARNING_FIELD] # Failed to read from EEPROM
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD return None
if support is False:
# Do not support
return 0.0
return thresh.get(consts.TEMP_HIGH_WARNING_FIELD, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD)
def get_temperature_critical_threashold(self): def get_temperature_critical_threshold(self):
"""Get temperature critical threshold """Get temperature critical threshold
Returns: Returns:
int: temperature critical threshold None if there is an error (module EEPROM not readable)
0.0 if critical threshold is not supported or module is under initialization
other float value if critical threshold is available
""" """
try: try:
if not self.is_sw_control(): self.is_sw_control()
critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
log_func=None,
default=None)
return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
except: except:
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD return 0.0
thresh = self._get_temperature_threshold() support, thresh = self._get_temperature_threshold()
if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh: if support is None or thresh is None:
return thresh[consts.TEMP_HIGH_ALARM_FIELD] # Failed to read from EEPROM
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD return None
if support is False:
# Do not support
return 0.0
return thresh.get(consts.TEMP_HIGH_ALARM_FIELD, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD)
def _get_temperature_threshold(self): def _get_temperature_threshold(self):
"""Get temperature thresholds data from EEPROM
Returns:
tuple: (support, thresh_dict)
"""
self.reinit() self.reinit()
api = self.get_xcvr_api() api = self.get_xcvr_api()
if not api: if not api:
return None return None, None
thresh_support = api.get_transceiver_thresholds_support() thresh_support = api.get_transceiver_thresholds_support()
if thresh_support: if thresh_support:
if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api): if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api):
return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD) return thresh_support, api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD) return thresh_support, api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
else: else:
return None return thresh_support, {}
def get_xcvr_api(self): def get_xcvr_api(self):
""" """
@ -968,13 +977,18 @@ class SFP(NvidiaSFPCommon):
db = utils.DbUtils.get_db_instance('STATE_DB') db = utils.DbUtils.get_db_instance('STATE_DB')
logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index) logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index)
if not logical_port: if not logical_port:
raise Exception(f'Module {self.sdk_index} is not present or in initialization') raise Exception(f'Module {self.sdk_index} is not present or under initialization')
initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}') initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}')
if not initialized: if not initialized:
raise Exception(f'Module {self.sdk_index} is not present or in initialization') raise Exception(f'Module {self.sdk_index} is not present or under initialization')
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') == 1 try:
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control',
raise_exception=True, log_func=None) == 1
except:
# just in case control file does not exist
raise Exception(f'Module {self.sdk_index} is under initialization')
class RJ45Port(NvidiaSFPCommon): class RJ45Port(NvidiaSFPCommon):

View File

@ -431,7 +431,8 @@ class ModuleThermal(ThermalBase):
A float number of current temperature in Celsius up to nearest thousandth A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125 of one degree Celsius, e.g. 30.125
""" """
return self.sfp.get_temperature() value = self.sfp.get_temperature()
return value if (value != 0.0 and value is not None) else None
def get_high_threshold(self): def get_high_threshold(self):
""" """
@ -441,7 +442,8 @@ class ModuleThermal(ThermalBase):
A float number, the high threshold temperature of thermal in Celsius A float number, the high threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125 up to nearest thousandth of one degree Celsius, e.g. 30.125
""" """
return self.sfp.get_temperature_warning_threashold() value = self.sfp.get_temperature_warning_threshold()
return value if (value != 0.0 and value is not None) else None
def get_high_critical_threshold(self): def get_high_critical_threshold(self):
""" """
@ -451,7 +453,8 @@ class ModuleThermal(ThermalBase):
A float number, the high critical threshold temperature of thermal in Celsius A float number, the high critical threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125 up to nearest thousandth of one degree Celsius, e.g. 30.125
""" """
return self.sfp.get_temperature_critical_threashold() value = self.sfp.get_temperature_critical_threshold()
return value if (value != 0.0 and value is not None) else None
def get_position_in_parent(self): def get_position_in_parent(self):
""" """

View File

@ -46,5 +46,5 @@ class ThermalManager(ThermalManagerBase):
is a no-op. is a no-op.
:return: :return:
""" """
if DeviceDataManager.is_independent_mode(): if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task:
cls.thermal_updater_task.stop() cls.thermal_updater_task.stop()

View File

@ -56,7 +56,7 @@ class ThermalUpdater:
def load_tc_config(self): def load_tc_config(self):
asic_poll_interval = 1 asic_poll_interval = 1
sfp_poll_interval = 10 sfp_poll_interval = 10
data = utils.load_json_file(TC_CONFIG_FILE) data = utils.load_json_file(TC_CONFIG_FILE, log_func=None)
if not data: if not data:
logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval') logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
@ -108,7 +108,7 @@ class ThermalUpdater:
def wait_all_sfp_ready(self): def wait_all_sfp_ready(self):
logger.log_notice('Waiting for all SFP modules ready...') logger.log_notice('Waiting for all SFP modules ready...')
max_wait_time = 60 max_wait_time = 300
ready_set = set() ready_set = set()
while len(ready_set) != len(self._sfp_list): while len(ready_set) != len(self._sfp_list):
for sfp in self._sfp_list: for sfp in self._sfp_list:
@ -129,11 +129,11 @@ class ThermalUpdater:
temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None
def get_asic_temp_warning_threashold(self): def get_asic_temp_warning_threshold(self):
emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None) emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
def get_asic_temp_critical_threashold(self): def get_asic_temp_critical_threshold(self):
critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None) critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
@ -148,19 +148,19 @@ class ThermalUpdater:
critical_thresh = 0 critical_thresh = 0
fault = 0 fault = 0
else: else:
warning_thresh = sfp.get_temperature_warning_threashold() warning_thresh = sfp.get_temperature_warning_threshold()
critical_thresh = sfp.get_temperature_critical_threashold() critical_thresh = sfp.get_temperature_critical_threshold()
fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0 fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE) temperature = 0 if temperature is None else temperature * SFP_TEMPERATURE_SCALE
warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE) warning_thresh = 0 if warning_thresh is None else warning_thresh * SFP_TEMPERATURE_SCALE
critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE) critical_thresh = 0 if critical_thresh is None else critical_thresh * SFP_TEMPERATURE_SCALE
hw_management_independent_mode_update.thermal_data_set_module( hw_management_independent_mode_update.thermal_data_set_module(
0, # ASIC index always 0 for now 0, # ASIC index always 0 for now
sfp.sdk_index + 1, sfp.sdk_index + 1,
temperature, int(temperature),
critical_thresh, int(critical_thresh),
warning_thresh, int(warning_thresh),
fault fault
) )
else: else:
@ -170,7 +170,7 @@ class ThermalUpdater:
if pre_presence != presence: if pre_presence != presence:
self._sfp_status[sfp.sdk_index] = presence self._sfp_status[sfp.sdk_index] = presence
except Exception as e: except Exception as e:
logger.log_error('Failed to update module {sfp.sdk_index} thermal data - {e}') logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
hw_management_independent_mode_update.thermal_data_set_module( hw_management_independent_mode_update.thermal_data_set_module(
0, # ASIC index always 0 for now 0, # ASIC index always 0 for now
sfp.sdk_index + 1, sfp.sdk_index + 1,
@ -187,8 +187,8 @@ class ThermalUpdater:
def update_asic(self): def update_asic(self):
try: try:
asic_temp = self.get_asic_temp() asic_temp = self.get_asic_temp()
warn_threshold = self.get_asic_temp_warning_threashold() warn_threshold = self.get_asic_temp_warning_threshold()
critical_threshold = self.get_asic_temp_critical_threashold() critical_threshold = self.get_asic_temp_critical_threshold()
fault = 0 fault = 0
if asic_temp is None: if asic_temp is None:
logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc') logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
@ -203,7 +203,7 @@ class ThermalUpdater:
fault fault
) )
except Exception as e: except Exception as e:
logger.log_error('Failed to update ASIC thermal data - {e}') logger.log_error(f'Failed to update ASIC thermal data - {e}')
hw_management_independent_mode_update.thermal_data_set_asic( hw_management_independent_mode_update.thermal_data_set_asic(
0, # ASIC index always 0 for now 0, # ASIC index always 0 for now
0, 0,

View File

@ -230,8 +230,9 @@ class TestSfp:
assert page == '/tmp/1/data' assert page == '/tmp/1/data'
assert page_offset is 0 assert page_offset is 0
@mock.patch('sonic_platform.sfp.SFP.is_sw_control')
@mock.patch('sonic_platform.sfp.SFP._read_eeprom') @mock.patch('sonic_platform.sfp.SFP._read_eeprom')
def test_sfp_get_presence(self, mock_read): def test_sfp_get_presence(self, mock_read, mock_control):
sfp = SFP(0) sfp = SFP(0)
mock_read.return_value = None mock_read.return_value = None
assert not sfp.get_presence() assert not sfp.get_presence()
@ -239,6 +240,9 @@ class TestSfp:
mock_read.return_value = 0 mock_read.return_value = 0
assert sfp.get_presence() assert sfp.get_presence()
mock_control.side_effect = RuntimeError('')
assert not sfp.get_presence()
@mock.patch('sonic_platform.utils.read_int_from_file') @mock.patch('sonic_platform.utils.read_int_from_file')
def test_rj45_get_presence(self, mock_read_int): def test_rj45_get_presence(self, mock_read_int):
sfp = RJ45Port(0) sfp = RJ45Port(0)
@ -318,14 +322,16 @@ class TestSfp:
def test_get_temperature_threshold(self): def test_get_temperature_threshold(self):
sfp = SFP(0) sfp = SFP(0)
sfp.is_sw_control = mock.MagicMock(return_value=True) sfp.is_sw_control = mock.MagicMock(return_value=True)
assert sfp.get_temperature_warning_threashold() == 70.0
assert sfp.get_temperature_critical_threashold() == 80.0
mock_api = mock.MagicMock() mock_api = mock.MagicMock()
mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False) mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api) sfp.get_xcvr_api = mock.MagicMock(return_value=None)
assert sfp.get_temperature_warning_threashold() == 70.0 assert sfp.get_temperature_warning_threshold() is None
assert sfp.get_temperature_critical_threashold() == 80.0 assert sfp.get_temperature_critical_threshold() is None
sfp.get_xcvr_api.return_value = mock_api
assert sfp.get_temperature_warning_threshold() == 0.0
assert sfp.get_temperature_critical_threshold() == 0.0
from sonic_platform_base.sonic_xcvr.fields import consts from sonic_platform_base.sonic_xcvr.fields import consts
mock_api.get_transceiver_thresholds_support.return_value = True mock_api.get_transceiver_thresholds_support.return_value = True
@ -334,8 +340,8 @@ class TestSfp:
consts.TEMP_HIGH_ALARM_FIELD: 85.0, consts.TEMP_HIGH_ALARM_FIELD: 85.0,
consts.TEMP_HIGH_WARNING_FIELD: 75.0 consts.TEMP_HIGH_WARNING_FIELD: 75.0
}) })
assert sfp.get_temperature_warning_threashold() == 75.0 assert sfp.get_temperature_warning_threshold() == 75.0
assert sfp.get_temperature_critical_threashold() == 85.0 assert sfp.get_temperature_critical_threshold() == 85.0
@mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index') @mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index')
@mock.patch('sonic_platform.utils.read_int_from_file') @mock.patch('sonic_platform.utils.read_int_from_file')

View File

@ -160,11 +160,17 @@ class TestThermal:
assert thermal.get_position_in_parent() == 1 assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False assert thermal.is_replaceable() == False
sfp.get_temperature = mock.MagicMock(return_value=35.4) sfp.get_temperature = mock.MagicMock(return_value=35.4)
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70) sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80) sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80)
assert thermal.get_temperature() == 35.4 assert thermal.get_temperature() == 35.4
assert thermal.get_high_threshold() == 70 assert thermal.get_high_threshold() == 70
assert thermal.get_high_critical_threshold() == 80 assert thermal.get_high_critical_threshold() == 80
sfp.get_temperature = mock.MagicMock(return_value=0)
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=0)
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None
assert thermal.get_high_threshold() is None
assert thermal.get_high_critical_threshold() is None
@mock.patch('sonic_platform.utils.read_float_from_file') @mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_temperature(self, mock_read): def test_get_temperature(self, mock_read):

View File

@ -97,23 +97,23 @@ class TestThermalUpdater:
mock_read.return_value = 8 mock_read.return_value = 8
updater = ThermalUpdater(None) updater = ThermalUpdater(None)
assert updater.get_asic_temp() == 1000 assert updater.get_asic_temp() == 1000
assert updater.get_asic_temp_warning_threashold() == 1000 assert updater.get_asic_temp_warning_threshold() == 1000
assert updater.get_asic_temp_critical_threashold() == 1000 assert updater.get_asic_temp_critical_threshold() == 1000
updater.update_asic() updater.update_asic()
hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once() hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()
mock_read.return_value = None mock_read.return_value = None
assert updater.get_asic_temp() is None assert updater.get_asic_temp() is None
assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD assert updater.get_asic_temp_warning_threshold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
def test_update_module(self): def test_update_module(self):
mock_sfp = mock.MagicMock() mock_sfp = mock.MagicMock()
mock_sfp.sdk_index = 10 mock_sfp.sdk_index = 10
mock_sfp.get_presence = mock.MagicMock(return_value=True) mock_sfp.get_presence = mock.MagicMock(return_value=True)
mock_sfp.get_temperature = mock.MagicMock(return_value=55.0) mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0) mock_sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70.0)
mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0) mock_sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80.0)
updater = ThermalUpdater([mock_sfp]) updater = ThermalUpdater([mock_sfp])
updater.update_module() updater.update_module()
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0) hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)