[Mellanox] Fix issues found for CMIS host management (#17637)
- Why I did it:
  1. The thermal updater should wait longer for modules to be initialized.
  2. The SFP code should read temperature thresholds from the module EEPROM, because the SDK sysfs interface for them is not yet supported.
  3. Rename SFP functions to fix a typo ("threashold" -> "threshold").
  4. sfp.get_presence should return False while the module is under initialization.
- How I did it:
  1. The thermal updater now waits longer for modules to be initialized.
  2. The SFP code now reads temperature thresholds from the module EEPROM, because the SDK sysfs interface for them is not yet supported.
  3. Renamed the SFP functions to fix the typo.
  4. sfp.get_presence now returns False while the module is under initialization.
- How to verify it:
  - Manual test
  - Unit test
This commit is contained in:
parent
c20abb9e28
commit
ee49d0dfec
@ -327,17 +327,10 @@ class SFP(NvidiaSFPCommon):
|
||||
Returns:
|
||||
bool: True if device is present, False if not
|
||||
"""
|
||||
if DeviceDataManager.is_independent_mode():
|
||||
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') != 0:
|
||||
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'):
|
||||
return False
|
||||
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good'):
|
||||
return False
|
||||
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on'):
|
||||
return False
|
||||
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1:
|
||||
return False
|
||||
|
||||
try:
|
||||
self.is_sw_control()
|
||||
except:
|
||||
return False
|
||||
eeprom_raw = self._read_eeprom(0, 1, log_on_error=False)
|
||||
return eeprom_raw is not None
|
||||
|
||||
@ -877,6 +870,13 @@ class SFP(NvidiaSFPCommon):
|
||||
return [False] * api.NUM_CHANNELS if api else None
|
||||
|
||||
def get_temperature(self):
|
||||
"""Get SFP temperature
|
||||
|
||||
Returns:
|
||||
None if there is an error (sysfs does not exist or sysfs return None or module EEPROM not readable)
|
||||
0.0 if module temperature is not supported or module is under initialization
|
||||
other float value if module temperature is available
|
||||
"""
|
||||
try:
|
||||
if not self.is_sw_control():
|
||||
temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
|
||||
@ -893,59 +893,68 @@ class SFP(NvidiaSFPCommon):
|
||||
temperature = super().get_temperature()
|
||||
return temperature if temperature is not None else None
|
||||
|
||||
def get_temperature_warning_threashold(self):
|
||||
def get_temperature_warning_threshold(self):
|
||||
"""Get temperature warning threshold
|
||||
|
||||
Returns:
|
||||
int: temperature warning threshold
|
||||
None if there is an error (module EEPROM not readable)
|
||||
0.0 if warning threshold is not supported or module is under initialization
|
||||
other float value if warning threshold is available
|
||||
"""
|
||||
try:
|
||||
if not self.is_sw_control():
|
||||
emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
|
||||
log_func=None,
|
||||
default=None)
|
||||
return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
self.is_sw_control()
|
||||
except:
|
||||
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
return 0.0
|
||||
|
||||
support, thresh = self._get_temperature_threshold()
|
||||
if support is None or thresh is None:
|
||||
# Failed to read from EEPROM
|
||||
return None
|
||||
if support is False:
|
||||
# Do not support
|
||||
return 0.0
|
||||
return thresh.get(consts.TEMP_HIGH_WARNING_FIELD, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD)
|
||||
|
||||
thresh = self._get_temperature_threshold()
|
||||
if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
|
||||
return thresh[consts.TEMP_HIGH_WARNING_FIELD]
|
||||
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
|
||||
def get_temperature_critical_threashold(self):
|
||||
def get_temperature_critical_threshold(self):
|
||||
"""Get temperature critical threshold
|
||||
|
||||
Returns:
|
||||
int: temperature critical threshold
|
||||
None if there is an error (module EEPROM not readable)
|
||||
0.0 if critical threshold is not supported or module is under initialization
|
||||
other float value if critical threshold is available
|
||||
"""
|
||||
try:
|
||||
if not self.is_sw_control():
|
||||
critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
|
||||
log_func=None,
|
||||
default=None)
|
||||
return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
self.is_sw_control()
|
||||
except:
|
||||
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
return 0.0
|
||||
|
||||
thresh = self._get_temperature_threshold()
|
||||
if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
|
||||
return thresh[consts.TEMP_HIGH_ALARM_FIELD]
|
||||
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
support, thresh = self._get_temperature_threshold()
|
||||
if support is None or thresh is None:
|
||||
# Failed to read from EEPROM
|
||||
return None
|
||||
if support is False:
|
||||
# Do not support
|
||||
return 0.0
|
||||
return thresh.get(consts.TEMP_HIGH_ALARM_FIELD, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD)
|
||||
|
||||
def _get_temperature_threshold(self):
|
||||
"""Get temperature thresholds data from EEPROM
|
||||
|
||||
Returns:
|
||||
tuple: (support, thresh_dict)
|
||||
"""
|
||||
self.reinit()
|
||||
api = self.get_xcvr_api()
|
||||
if not api:
|
||||
return None
|
||||
return None, None
|
||||
|
||||
thresh_support = api.get_transceiver_thresholds_support()
|
||||
if thresh_support:
|
||||
if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api):
|
||||
return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
|
||||
return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
|
||||
return thresh_support, api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
|
||||
return thresh_support, api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
|
||||
else:
|
||||
return None
|
||||
return thresh_support, {}
|
||||
|
||||
def get_xcvr_api(self):
|
||||
"""
|
||||
@ -964,17 +973,22 @@ class SFP(NvidiaSFPCommon):
|
||||
def is_sw_control(self):
|
||||
if not DeviceDataManager.is_independent_mode():
|
||||
return False
|
||||
|
||||
|
||||
db = utils.DbUtils.get_db_instance('STATE_DB')
|
||||
logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index)
|
||||
if not logical_port:
|
||||
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
|
||||
raise Exception(f'Module {self.sdk_index} is not present or under initialization')
|
||||
|
||||
initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}')
|
||||
if not initialized:
|
||||
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
|
||||
raise Exception(f'Module {self.sdk_index} is not present or under initialization')
|
||||
|
||||
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') == 1
|
||||
try:
|
||||
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control',
|
||||
raise_exception=True, log_func=None) == 1
|
||||
except:
|
||||
# just in case control file does not exist
|
||||
raise Exception(f'Module {self.sdk_index} is under initialization')
|
||||
|
||||
|
||||
class RJ45Port(NvidiaSFPCommon):
|
||||
|
@ -431,7 +431,8 @@ class ModuleThermal(ThermalBase):
|
||||
A float number of current temperature in Celsius up to nearest thousandth
|
||||
of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
return self.sfp.get_temperature()
|
||||
value = self.sfp.get_temperature()
|
||||
return value if (value != 0.0 and value is not None) else None
|
||||
|
||||
def get_high_threshold(self):
|
||||
"""
|
||||
@ -441,7 +442,8 @@ class ModuleThermal(ThermalBase):
|
||||
A float number, the high threshold temperature of thermal in Celsius
|
||||
up to nearest thousandth of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
return self.sfp.get_temperature_warning_threashold()
|
||||
value = self.sfp.get_temperature_warning_threshold()
|
||||
return value if (value != 0.0 and value is not None) else None
|
||||
|
||||
def get_high_critical_threshold(self):
|
||||
"""
|
||||
@ -451,7 +453,8 @@ class ModuleThermal(ThermalBase):
|
||||
A float number, the high critical threshold temperature of thermal in Celsius
|
||||
up to nearest thousandth of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
return self.sfp.get_temperature_critical_threashold()
|
||||
value = self.sfp.get_temperature_critical_threshold()
|
||||
return value if (value != 0.0 and value is not None) else None
|
||||
|
||||
def get_position_in_parent(self):
|
||||
"""
|
||||
|
@ -46,5 +46,5 @@ class ThermalManager(ThermalManagerBase):
|
||||
is a no-op.
|
||||
:return:
|
||||
"""
|
||||
if DeviceDataManager.is_independent_mode():
|
||||
if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task:
|
||||
cls.thermal_updater_task.stop()
|
||||
|
@ -56,7 +56,7 @@ class ThermalUpdater:
|
||||
def load_tc_config(self):
|
||||
asic_poll_interval = 1
|
||||
sfp_poll_interval = 10
|
||||
data = utils.load_json_file(TC_CONFIG_FILE)
|
||||
data = utils.load_json_file(TC_CONFIG_FILE, log_func=None)
|
||||
if not data:
|
||||
logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
|
||||
|
||||
@ -108,7 +108,7 @@ class ThermalUpdater:
|
||||
|
||||
def wait_all_sfp_ready(self):
|
||||
logger.log_notice('Waiting for all SFP modules ready...')
|
||||
max_wait_time = 60
|
||||
max_wait_time = 300
|
||||
ready_set = set()
|
||||
while len(ready_set) != len(self._sfp_list):
|
||||
for sfp in self._sfp_list:
|
||||
@ -129,11 +129,11 @@ class ThermalUpdater:
|
||||
temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
|
||||
return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None
|
||||
|
||||
def get_asic_temp_warning_threashold(self):
|
||||
def get_asic_temp_warning_threshold(self):
|
||||
emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
|
||||
return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
|
||||
def get_asic_temp_critical_threashold(self):
|
||||
def get_asic_temp_critical_threshold(self):
|
||||
critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
|
||||
return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
|
||||
@ -148,19 +148,19 @@ class ThermalUpdater:
|
||||
critical_thresh = 0
|
||||
fault = 0
|
||||
else:
|
||||
warning_thresh = sfp.get_temperature_warning_threashold()
|
||||
critical_thresh = sfp.get_temperature_critical_threashold()
|
||||
warning_thresh = sfp.get_temperature_warning_threshold()
|
||||
critical_thresh = sfp.get_temperature_critical_threshold()
|
||||
fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
|
||||
temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
|
||||
warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
|
||||
critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)
|
||||
temperature = 0 if temperature is None else temperature * SFP_TEMPERATURE_SCALE
|
||||
warning_thresh = 0 if warning_thresh is None else warning_thresh * SFP_TEMPERATURE_SCALE
|
||||
critical_thresh = 0 if critical_thresh is None else critical_thresh * SFP_TEMPERATURE_SCALE
|
||||
|
||||
hw_management_independent_mode_update.thermal_data_set_module(
|
||||
0, # ASIC index always 0 for now
|
||||
sfp.sdk_index + 1,
|
||||
temperature,
|
||||
critical_thresh,
|
||||
warning_thresh,
|
||||
int(temperature),
|
||||
int(critical_thresh),
|
||||
int(warning_thresh),
|
||||
fault
|
||||
)
|
||||
else:
|
||||
@ -170,7 +170,7 @@ class ThermalUpdater:
|
||||
if pre_presence != presence:
|
||||
self._sfp_status[sfp.sdk_index] = presence
|
||||
except Exception as e:
|
||||
logger.log_error('Failed to update module {sfp.sdk_index} thermal data - {e}')
|
||||
logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
|
||||
hw_management_independent_mode_update.thermal_data_set_module(
|
||||
0, # ASIC index always 0 for now
|
||||
sfp.sdk_index + 1,
|
||||
@ -187,8 +187,8 @@ class ThermalUpdater:
|
||||
def update_asic(self):
|
||||
try:
|
||||
asic_temp = self.get_asic_temp()
|
||||
warn_threshold = self.get_asic_temp_warning_threashold()
|
||||
critical_threshold = self.get_asic_temp_critical_threashold()
|
||||
warn_threshold = self.get_asic_temp_warning_threshold()
|
||||
critical_threshold = self.get_asic_temp_critical_threshold()
|
||||
fault = 0
|
||||
if asic_temp is None:
|
||||
logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
|
||||
@ -203,7 +203,7 @@ class ThermalUpdater:
|
||||
fault
|
||||
)
|
||||
except Exception as e:
|
||||
logger.log_error('Failed to update ASIC thermal data - {e}')
|
||||
logger.log_error(f'Failed to update ASIC thermal data - {e}')
|
||||
hw_management_independent_mode_update.thermal_data_set_asic(
|
||||
0, # ASIC index always 0 for now
|
||||
0,
|
||||
|
@ -230,14 +230,18 @@ class TestSfp:
|
||||
assert page == '/tmp/1/data'
|
||||
assert page_offset is 0
|
||||
|
||||
@mock.patch('sonic_platform.sfp.SFP.is_sw_control')
|
||||
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
|
||||
def test_sfp_get_presence(self, mock_read):
|
||||
def test_sfp_get_presence(self, mock_read, mock_control):
|
||||
sfp = SFP(0)
|
||||
mock_read.return_value = None
|
||||
assert not sfp.get_presence()
|
||||
|
||||
mock_read.return_value = 0
|
||||
assert sfp.get_presence()
|
||||
|
||||
mock_control.side_effect = RuntimeError('')
|
||||
assert not sfp.get_presence()
|
||||
|
||||
@mock.patch('sonic_platform.utils.read_int_from_file')
|
||||
def test_rj45_get_presence(self, mock_read_int):
|
||||
@ -318,14 +322,16 @@ class TestSfp:
|
||||
def test_get_temperature_threshold(self):
|
||||
sfp = SFP(0)
|
||||
sfp.is_sw_control = mock.MagicMock(return_value=True)
|
||||
assert sfp.get_temperature_warning_threashold() == 70.0
|
||||
assert sfp.get_temperature_critical_threashold() == 80.0
|
||||
|
||||
mock_api = mock.MagicMock()
|
||||
mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
|
||||
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)
|
||||
assert sfp.get_temperature_warning_threashold() == 70.0
|
||||
assert sfp.get_temperature_critical_threashold() == 80.0
|
||||
sfp.get_xcvr_api = mock.MagicMock(return_value=None)
|
||||
assert sfp.get_temperature_warning_threshold() is None
|
||||
assert sfp.get_temperature_critical_threshold() is None
|
||||
|
||||
sfp.get_xcvr_api.return_value = mock_api
|
||||
assert sfp.get_temperature_warning_threshold() == 0.0
|
||||
assert sfp.get_temperature_critical_threshold() == 0.0
|
||||
|
||||
from sonic_platform_base.sonic_xcvr.fields import consts
|
||||
mock_api.get_transceiver_thresholds_support.return_value = True
|
||||
@ -334,8 +340,8 @@ class TestSfp:
|
||||
consts.TEMP_HIGH_ALARM_FIELD: 85.0,
|
||||
consts.TEMP_HIGH_WARNING_FIELD: 75.0
|
||||
})
|
||||
assert sfp.get_temperature_warning_threashold() == 75.0
|
||||
assert sfp.get_temperature_critical_threashold() == 85.0
|
||||
assert sfp.get_temperature_warning_threshold() == 75.0
|
||||
assert sfp.get_temperature_critical_threshold() == 85.0
|
||||
|
||||
@mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index')
|
||||
@mock.patch('sonic_platform.utils.read_int_from_file')
|
||||
|
@ -160,11 +160,17 @@ class TestThermal:
|
||||
assert thermal.get_position_in_parent() == 1
|
||||
assert thermal.is_replaceable() == False
|
||||
sfp.get_temperature = mock.MagicMock(return_value=35.4)
|
||||
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
|
||||
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
|
||||
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70)
|
||||
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80)
|
||||
assert thermal.get_temperature() == 35.4
|
||||
assert thermal.get_high_threshold() == 70
|
||||
assert thermal.get_high_critical_threshold() == 80
|
||||
sfp.get_temperature = mock.MagicMock(return_value=0)
|
||||
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=0)
|
||||
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=None)
|
||||
assert thermal.get_temperature() is None
|
||||
assert thermal.get_high_threshold() is None
|
||||
assert thermal.get_high_critical_threshold() is None
|
||||
|
||||
@mock.patch('sonic_platform.utils.read_float_from_file')
|
||||
def test_get_temperature(self, mock_read):
|
||||
|
@ -97,23 +97,23 @@ class TestThermalUpdater:
|
||||
mock_read.return_value = 8
|
||||
updater = ThermalUpdater(None)
|
||||
assert updater.get_asic_temp() == 1000
|
||||
assert updater.get_asic_temp_warning_threashold() == 1000
|
||||
assert updater.get_asic_temp_critical_threashold() == 1000
|
||||
assert updater.get_asic_temp_warning_threshold() == 1000
|
||||
assert updater.get_asic_temp_critical_threshold() == 1000
|
||||
updater.update_asic()
|
||||
hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()
|
||||
|
||||
mock_read.return_value = None
|
||||
assert updater.get_asic_temp() is None
|
||||
assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
assert updater.get_asic_temp_warning_threshold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
|
||||
def test_update_module(self):
|
||||
mock_sfp = mock.MagicMock()
|
||||
mock_sfp.sdk_index = 10
|
||||
mock_sfp.get_presence = mock.MagicMock(return_value=True)
|
||||
mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
|
||||
mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
|
||||
mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
|
||||
mock_sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70.0)
|
||||
mock_sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80.0)
|
||||
updater = ThermalUpdater([mock_sfp])
|
||||
updater.update_module()
|
||||
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)
|
||||
|
Loading…
Reference in New Issue
Block a user