[Mellanox] Fix issues found for CMIS host management (#17637)

- Why I did it
1. The thermal updater should wait longer for modules to be initialized
2. SFP should get temperature thresholds from the module EEPROM because the SDK sysfs is not yet supported
3. Rename SFP functions to fix a typo (`threashold` -> `threshold`)
4. `sfp.get_presence` should return False if the module is under initialization

- How I did it
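1. Increase `max_wait_time` in `ThermalUpdater.wait_all_sfp_ready` from 60 to 300 so the thermal updater waits longer for modules to be initialized
2. Make the SFP temperature threshold getters read thresholds from the module EEPROM via `_get_temperature_threshold`, because the SDK sysfs is not yet supported
3. Rename the `*_threashold` functions to `*_threshold` to fix the typo
4. Make `sfp.get_presence` return False when `is_sw_control` raises, i.e. when the module is not present or is under initialization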

- How to verify it
Manual test
Unit test
This commit is contained in:
Junchao-Mellanox 2024-01-04 15:42:33 +08:00 committed by GitHub
parent c20abb9e28
commit ee49d0dfec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 109 additions and 80 deletions

View File

@ -327,17 +327,10 @@ class SFP(NvidiaSFPCommon):
Returns:
bool: True if device is present, False if not
"""
if DeviceDataManager.is_independent_mode():
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') != 0:
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'):
return False
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good'):
return False
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on'):
return False
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1:
return False
try:
self.is_sw_control()
except:
return False
eeprom_raw = self._read_eeprom(0, 1, log_on_error=False)
return eeprom_raw is not None
@ -877,6 +870,13 @@ class SFP(NvidiaSFPCommon):
return [False] * api.NUM_CHANNELS if api else None
def get_temperature(self):
"""Get SFP temperature
Returns:
None if there is an error (sysfs does not exist or sysfs return None or module EEPROM not readable)
0.0 if module temperature is not supported or module is under initialization
other float value if module temperature is available
"""
try:
if not self.is_sw_control():
temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
@ -893,59 +893,68 @@ class SFP(NvidiaSFPCommon):
temperature = super().get_temperature()
return temperature if temperature is not None else None
def get_temperature_warning_threashold(self):
def get_temperature_warning_threshold(self):
"""Get temperature warning threshold
Returns:
int: temperature warning threshold
None if there is an error (module EEPROM not readable)
0.0 if warning threshold is not supported or module is under initialization
other float value if warning threshold is available
"""
try:
if not self.is_sw_control():
emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
log_func=None,
default=None)
return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
self.is_sw_control()
except:
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
return 0.0
support, thresh = self._get_temperature_threshold()
if support is None or thresh is None:
# Failed to read from EEPROM
return None
if support is False:
# Do not support
return 0.0
return thresh.get(consts.TEMP_HIGH_WARNING_FIELD, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD)
thresh = self._get_temperature_threshold()
if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
return thresh[consts.TEMP_HIGH_WARNING_FIELD]
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
def get_temperature_critical_threashold(self):
def get_temperature_critical_threshold(self):
"""Get temperature critical threshold
Returns:
int: temperature critical threshold
None if there is an error (module EEPROM not readable)
0.0 if critical threshold is not supported or module is under initialization
other float value if critical threshold is available
"""
try:
if not self.is_sw_control():
critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
log_func=None,
default=None)
return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
self.is_sw_control()
except:
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
return 0.0
thresh = self._get_temperature_threshold()
if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
return thresh[consts.TEMP_HIGH_ALARM_FIELD]
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
support, thresh = self._get_temperature_threshold()
if support is None or thresh is None:
# Failed to read from EEPROM
return None
if support is False:
# Do not support
return 0.0
return thresh.get(consts.TEMP_HIGH_ALARM_FIELD, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD)
def _get_temperature_threshold(self):
"""Get temperature thresholds data from EEPROM
Returns:
tuple: (support, thresh_dict)
"""
self.reinit()
api = self.get_xcvr_api()
if not api:
return None
return None, None
thresh_support = api.get_transceiver_thresholds_support()
if thresh_support:
if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api):
return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
return thresh_support, api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
return thresh_support, api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
else:
return None
return thresh_support, {}
def get_xcvr_api(self):
"""
@ -964,17 +973,22 @@ class SFP(NvidiaSFPCommon):
def is_sw_control(self):
if not DeviceDataManager.is_independent_mode():
return False
db = utils.DbUtils.get_db_instance('STATE_DB')
logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index)
if not logical_port:
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
raise Exception(f'Module {self.sdk_index} is not present or under initialization')
initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}')
if not initialized:
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
raise Exception(f'Module {self.sdk_index} is not present or under initialization')
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') == 1
try:
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control',
raise_exception=True, log_func=None) == 1
except:
# just in case control file does not exist
raise Exception(f'Module {self.sdk_index} is under initialization')
class RJ45Port(NvidiaSFPCommon):

View File

@ -431,7 +431,8 @@ class ModuleThermal(ThermalBase):
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
return self.sfp.get_temperature()
value = self.sfp.get_temperature()
return value if (value != 0.0 and value is not None) else None
def get_high_threshold(self):
"""
@ -441,7 +442,8 @@ class ModuleThermal(ThermalBase):
A float number, the high threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
return self.sfp.get_temperature_warning_threashold()
value = self.sfp.get_temperature_warning_threshold()
return value if (value != 0.0 and value is not None) else None
def get_high_critical_threshold(self):
"""
@ -451,7 +453,8 @@ class ModuleThermal(ThermalBase):
A float number, the high critical threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
return self.sfp.get_temperature_critical_threashold()
value = self.sfp.get_temperature_critical_threshold()
return value if (value != 0.0 and value is not None) else None
def get_position_in_parent(self):
"""

View File

@ -46,5 +46,5 @@ class ThermalManager(ThermalManagerBase):
is a no-op.
:return:
"""
if DeviceDataManager.is_independent_mode():
if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task:
cls.thermal_updater_task.stop()

View File

@ -56,7 +56,7 @@ class ThermalUpdater:
def load_tc_config(self):
asic_poll_interval = 1
sfp_poll_interval = 10
data = utils.load_json_file(TC_CONFIG_FILE)
data = utils.load_json_file(TC_CONFIG_FILE, log_func=None)
if not data:
logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
@ -108,7 +108,7 @@ class ThermalUpdater:
def wait_all_sfp_ready(self):
logger.log_notice('Waiting for all SFP modules ready...')
max_wait_time = 60
max_wait_time = 300
ready_set = set()
while len(ready_set) != len(self._sfp_list):
for sfp in self._sfp_list:
@ -129,11 +129,11 @@ class ThermalUpdater:
temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None
def get_asic_temp_warning_threashold(self):
def get_asic_temp_warning_threshold(self):
emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
def get_asic_temp_critical_threashold(self):
def get_asic_temp_critical_threshold(self):
critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
@ -148,19 +148,19 @@ class ThermalUpdater:
critical_thresh = 0
fault = 0
else:
warning_thresh = sfp.get_temperature_warning_threashold()
critical_thresh = sfp.get_temperature_critical_threashold()
warning_thresh = sfp.get_temperature_warning_threshold()
critical_thresh = sfp.get_temperature_critical_threshold()
fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)
temperature = 0 if temperature is None else temperature * SFP_TEMPERATURE_SCALE
warning_thresh = 0 if warning_thresh is None else warning_thresh * SFP_TEMPERATURE_SCALE
critical_thresh = 0 if critical_thresh is None else critical_thresh * SFP_TEMPERATURE_SCALE
hw_management_independent_mode_update.thermal_data_set_module(
0, # ASIC index always 0 for now
sfp.sdk_index + 1,
temperature,
critical_thresh,
warning_thresh,
int(temperature),
int(critical_thresh),
int(warning_thresh),
fault
)
else:
@ -170,7 +170,7 @@ class ThermalUpdater:
if pre_presence != presence:
self._sfp_status[sfp.sdk_index] = presence
except Exception as e:
logger.log_error('Failed to update module {sfp.sdk_index} thermal data - {e}')
logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
hw_management_independent_mode_update.thermal_data_set_module(
0, # ASIC index always 0 for now
sfp.sdk_index + 1,
@ -187,8 +187,8 @@ class ThermalUpdater:
def update_asic(self):
try:
asic_temp = self.get_asic_temp()
warn_threshold = self.get_asic_temp_warning_threashold()
critical_threshold = self.get_asic_temp_critical_threashold()
warn_threshold = self.get_asic_temp_warning_threshold()
critical_threshold = self.get_asic_temp_critical_threshold()
fault = 0
if asic_temp is None:
logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
@ -203,7 +203,7 @@ class ThermalUpdater:
fault
)
except Exception as e:
logger.log_error('Failed to update ASIC thermal data - {e}')
logger.log_error(f'Failed to update ASIC thermal data - {e}')
hw_management_independent_mode_update.thermal_data_set_asic(
0, # ASIC index always 0 for now
0,

View File

@ -230,14 +230,18 @@ class TestSfp:
assert page == '/tmp/1/data'
assert page_offset is 0
@mock.patch('sonic_platform.sfp.SFP.is_sw_control')
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
def test_sfp_get_presence(self, mock_read):
def test_sfp_get_presence(self, mock_read, mock_control):
sfp = SFP(0)
mock_read.return_value = None
assert not sfp.get_presence()
mock_read.return_value = 0
assert sfp.get_presence()
mock_control.side_effect = RuntimeError('')
assert not sfp.get_presence()
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_rj45_get_presence(self, mock_read_int):
@ -318,14 +322,16 @@ class TestSfp:
def test_get_temperature_threshold(self):
sfp = SFP(0)
sfp.is_sw_control = mock.MagicMock(return_value=True)
assert sfp.get_temperature_warning_threashold() == 70.0
assert sfp.get_temperature_critical_threashold() == 80.0
mock_api = mock.MagicMock()
mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)
assert sfp.get_temperature_warning_threashold() == 70.0
assert sfp.get_temperature_critical_threashold() == 80.0
sfp.get_xcvr_api = mock.MagicMock(return_value=None)
assert sfp.get_temperature_warning_threshold() is None
assert sfp.get_temperature_critical_threshold() is None
sfp.get_xcvr_api.return_value = mock_api
assert sfp.get_temperature_warning_threshold() == 0.0
assert sfp.get_temperature_critical_threshold() == 0.0
from sonic_platform_base.sonic_xcvr.fields import consts
mock_api.get_transceiver_thresholds_support.return_value = True
@ -334,8 +340,8 @@ class TestSfp:
consts.TEMP_HIGH_ALARM_FIELD: 85.0,
consts.TEMP_HIGH_WARNING_FIELD: 75.0
})
assert sfp.get_temperature_warning_threashold() == 75.0
assert sfp.get_temperature_critical_threashold() == 85.0
assert sfp.get_temperature_warning_threshold() == 75.0
assert sfp.get_temperature_critical_threshold() == 85.0
@mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index')
@mock.patch('sonic_platform.utils.read_int_from_file')

View File

@ -160,11 +160,17 @@ class TestThermal:
assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False
sfp.get_temperature = mock.MagicMock(return_value=35.4)
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80)
assert thermal.get_temperature() == 35.4
assert thermal.get_high_threshold() == 70
assert thermal.get_high_critical_threshold() == 80
sfp.get_temperature = mock.MagicMock(return_value=0)
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=0)
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None
assert thermal.get_high_threshold() is None
assert thermal.get_high_critical_threshold() is None
@mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_temperature(self, mock_read):

View File

@ -97,23 +97,23 @@ class TestThermalUpdater:
mock_read.return_value = 8
updater = ThermalUpdater(None)
assert updater.get_asic_temp() == 1000
assert updater.get_asic_temp_warning_threashold() == 1000
assert updater.get_asic_temp_critical_threashold() == 1000
assert updater.get_asic_temp_warning_threshold() == 1000
assert updater.get_asic_temp_critical_threshold() == 1000
updater.update_asic()
hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()
mock_read.return_value = None
assert updater.get_asic_temp() is None
assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
assert updater.get_asic_temp_warning_threshold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
def test_update_module(self):
mock_sfp = mock.MagicMock()
mock_sfp.sdk_index = 10
mock_sfp.get_presence = mock.MagicMock(return_value=True)
mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
mock_sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70.0)
mock_sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80.0)
updater = ThermalUpdater([mock_sfp])
updater.update_module()
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)