[Mellanox] Support PSU power threshold checking (#11863)

* Support power threshold

Signed-off-by: Stephen Sun <stephens@nvidia.com>

* get_psu_power_warning_threshold => get_psu_power_warning_suppress_threshold

Signed-off-by: Stephen Sun <stephens@nvidia.com>

* Fix comments

Signed-off-by: Stephen Sun <stephens@nvidia.com>

Signed-off-by: Stephen Sun <stephens@nvidia.com>
This commit is contained in:
Stephen Sun 2022-11-22 06:47:43 +08:00 committed by GitHub
parent f402e6b5c6
commit 5d457596ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 112 additions and 0 deletions

View File

@ -216,6 +216,11 @@ class Psu(FixedPsu):
# Relative path templates; the "{}" placeholder is presumably filled with the PSU index — verify against the caller.
PSU_VPD = "eeprom/psu{}_vpd"
PSU_CURRENT_IN = "power/psu{}_curr_in"
PSU_VOLT_IN = "power/psu{}_volt_in"
# Ambient temperature readings; _get_psu_power_threshold uses the lower of the two.
PORT_AMBIENT_TEMP = os.path.join(PSU_PATH, "thermal/port_amb")
FAN_AMBIENT_TEMP = os.path.join(PSU_PATH, "thermal/fan_amb")
# Ambient temperature limits; once the ambient reading reaches a limit, the
# corresponding power threshold is reduced below the PSU's maximum capacity.
AMBIENT_TEMP_CRITICAL_THRESHOLD = os.path.join(PSU_PATH, "config/amb_tmp_crit_limit")
AMBIENT_TEMP_WARNING_THRESHOLD = os.path.join(PSU_PATH, "config/amb_tmp_warn_limit")
# Multiplier applied to (ambient temperature - limit) when reducing the power threshold.
PSU_POWER_SLOPE = os.path.join(PSU_PATH, "config/psu_power_slope")
shared_led = None
@ -235,6 +240,8 @@ class Psu(FixedPsu):
self.psu_power_max = self.psu_power + "_max"
self.psu_presence = os.path.join(PSU_PATH, "thermal/psu{}_status".format(self.index))
self.psu_power_max_capacity = os.path.join(PSU_PATH, "config/psu{}_power_capacity".format(self.index))
self.psu_temp = os.path.join(PSU_PATH, 'thermal/psu{}_temp'.format(self.index))
self.psu_temp_threshold = os.path.join(PSU_PATH, 'thermal/psu{}_temp_max'.format(self.index))
@ -505,6 +512,56 @@ class Psu(FixedPsu):
return float(amperes) / 1000
return None
def _get_psu_power_threshold(self, temp_threshold_path):
    """
    Derive a power threshold for this PSU from its maximum power capacity and
    the current ambient temperature.

    The ambient temperature is the lower of the fan and port ambient readings:
        ambient < limit  -> threshold = max capacity
        ambient >= limit -> threshold = max capacity - slope * (ambient - limit)
    A result of zero or below is logged as a warning and clamped to 0.

    Args:
        temp_threshold_path: path of the file holding the ambient temperature
            limit to compare against (warning or critical).

    Returns:
        The threshold as a float (raw value divided by 1000000), or None when
        the PSU power is not good or the max capacity file does not exist.
    """
    # Nothing can be computed for a PSU that is not powered properly.
    if not self.get_powergood_status():
        return None

    # The capacity file is the marker that thresholds are available at all.
    if not os.path.exists(self.psu_power_max_capacity):
        return None

    max_capacity = utils.read_int_from_file(self.psu_power_max_capacity)
    ambient_limit = utils.read_int_from_file(temp_threshold_path)
    fan_reading = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP)
    port_reading = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP)
    ambient = min(fan_reading, port_reading)

    if ambient >= ambient_limit:
        # At or above the limit: reduce the capacity proportionally to the excess.
        slope = utils.read_int_from_file(Psu.PSU_POWER_SLOPE)
        power_threshold = max_capacity - (ambient - ambient_limit) * slope
    else:
        power_threshold = max_capacity

    if power_threshold <= 0:
        logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_threshold, self.get_name()))
        power_threshold = 0

    # Public wrappers document watts, so the raw value is presumably in microwatts.
    return float(power_threshold) / 1000000
def get_psu_power_warning_suppress_threshold(self):
    """
    Return the power warning suppress threshold of this PSU.

    The result depends on the current ambient temperature, so callers should
    query it every time rather than caching it.
    On Mellanox platform it is derived from the `warning threshold`.

    Returns:
        A float number, the warning suppress threshold of the PSU in watts,
        or None when the threshold cannot be calculated.
    """
    ambient_limit_path = Psu.AMBIENT_TEMP_WARNING_THRESHOLD
    return self._get_psu_power_threshold(ambient_limit_path)
def get_psu_power_critical_threshold(self):
    """
    Return the power critical threshold of this PSU.

    The result depends on the current ambient temperature, so callers should
    query it every time rather than caching it.

    Returns:
        A float number, the critical threshold of the PSU in watts,
        or None when the threshold cannot be calculated.
    """
    ambient_limit_path = Psu.AMBIENT_TEMP_CRITICAL_THRESHOLD
    return self._get_psu_power_threshold(ambient_limit_path)
class InvalidPsuVolWA:
"""This class is created as a workaround for a known hardware issue that the PSU voltage threshold could be a
invalid value 127998. Once we read a voltage threshold value equal to 127998, we should do following:

View File

@ -161,3 +161,58 @@ class TestPsu:
vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY
assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == 9999
mock_run_command.assert_called_with(['sensors', '-s'])
@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_psu_power_threshold(self, mock_read_int_from_file):
    """Check both power thresholds against three ambient temperature scenarios."""
    Psu.all_psus_support_power_threshold = True
    psu = Psu(0)

    # Readings that are identical in every scenario.
    fixed_readings = {
        psu.psu_oper_status: 1,
        psu.psu_power_max_capacity: 100000000,
        psu.AMBIENT_TEMP_CRITICAL_THRESHOLD: 65000,
        psu.AMBIENT_TEMP_WARNING_THRESHOLD: 55000,
        psu.PSU_POWER_SLOPE: 2000
    }

    # Each scenario carries the ambient readings plus the expected thresholds.
    scenarios = (
        # Lower ambient reading (50000) is below both limits: no reduction.
        {
            psu.PORT_AMBIENT_TEMP: 55000,
            psu.FAN_AMBIENT_TEMP: 50000,
            'warning_threshold': 100.0,
            'critical_threshold': 100.0
        },
        # Lower ambient reading (60000) exceeds only the warning limit by 5000.
        {
            psu.PORT_AMBIENT_TEMP: 65000,
            psu.FAN_AMBIENT_TEMP: 60000,
            'warning_threshold': 90.0,
            'critical_threshold': 100.0
        },
        # Lower ambient reading (70000) exceeds both limits.
        {
            psu.PORT_AMBIENT_TEMP: 70000,
            psu.FAN_AMBIENT_TEMP: 75000,
            'warning_threshold': 70.0,
            'critical_threshold': 90.0
        },
    )

    active_scenario = {}

    def fake_read_int(path):
        # Fixed readings win; everything else comes from the scenario in progress.
        if path in fixed_readings:
            return fixed_readings[path]
        return active_scenario[path]

    mock_read_int_from_file.side_effect = fake_read_int

    for scenario in scenarios:
        # Rebinding is visible to fake_read_int through the enclosing scope.
        active_scenario = scenario
        assert psu.get_psu_power_warning_suppress_threshold() == scenario['warning_threshold']
        assert psu.get_psu_power_critical_threshold() == scenario['critical_threshold']
def test_psu_not_support_power_threshold(self):
    """Both threshold getters return None when nothing is mocked for the PSU."""
    plain_psu = Psu(0)

    warning_result = plain_psu.get_psu_power_warning_suppress_threshold()
    assert warning_result is None

    critical_result = plain_psu.get_psu_power_critical_threshold()
    assert critical_result is None