From 5d457596ba2da9a10bcca808a7a321063aa547a1 Mon Sep 17 00:00:00 2001 From: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Date: Tue, 22 Nov 2022 06:47:43 +0800 Subject: [PATCH] [Mellanox] Support PSU power threshold checking (#11863) * Support power threshold Signed-off-by: Stephen Sun * get_psu_power_warning_threshold => get_psu_power_warning_suppress_threshold Signed-off-by: Stephen Sun * Fix comments Signed-off-by: Stephen Sun Signed-off-by: Stephen Sun --- .../mlnx-platform-api/sonic_platform/psu.py | 57 +++++++++++++++++++ .../mlnx-platform-api/tests/test_psu.py | 55 ++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index e447bbb435..9ad97a688f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -216,6 +216,11 @@ class Psu(FixedPsu): PSU_VPD = "eeprom/psu{}_vpd" PSU_CURRENT_IN = "power/psu{}_curr_in" PSU_VOLT_IN = "power/psu{}_volt_in" + PORT_AMBIENT_TEMP = os.path.join(PSU_PATH, "thermal/port_amb") + FAN_AMBIENT_TEMP = os.path.join(PSU_PATH, "thermal/fan_amb") + AMBIENT_TEMP_CRITICAL_THRESHOLD = os.path.join(PSU_PATH, "config/amb_tmp_crit_limit") + AMBIENT_TEMP_WARNING_THRESHOLD = os.path.join(PSU_PATH, "config/amb_tmp_warn_limit") + PSU_POWER_SLOPE = os.path.join(PSU_PATH, "config/psu_power_slope") shared_led = None @@ -235,6 +240,8 @@ class Psu(FixedPsu): self.psu_power_max = self.psu_power + "_max" self.psu_presence = os.path.join(PSU_PATH, "thermal/psu{}_status".format(self.index)) + self.psu_power_max_capacity = os.path.join(PSU_PATH, "config/psu{}_power_capacity".format(self.index)) + self.psu_temp = os.path.join(PSU_PATH, 'thermal/psu{}_temp'.format(self.index)) self.psu_temp_threshold = os.path.join(PSU_PATH, 'thermal/psu{}_temp_max'.format(self.index)) @@ -505,6 +512,56 @@ class Psu(FixedPsu): return float(amperes) / 1000 return None + def _get_psu_power_threshold(self, temp_threshold_path): + """ + Calculate power threshold for a PSU according to the maximum power capacity and ambient temperature + amb_temp = min(port_amb, fan_amb) + If amb_temp < ambient_temp_threshold + threshold = max capacity + else + threshold = max capacity - slope*(amb_temp - ambient_temp_threshold) + """ + if self.get_powergood_status(): + if os.path.exists(self.psu_power_max_capacity): + power_max_capacity = utils.read_int_from_file(self.psu_power_max_capacity) + temp_threshold = utils.read_int_from_file(temp_threshold_path) + fan_ambient_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP) + port_ambient_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP) + ambient_temp = min(fan_ambient_temp, port_ambient_temp) + if ambient_temp < temp_threshold: + power_threshold = power_max_capacity + else: + slope = utils.read_int_from_file(Psu.PSU_POWER_SLOPE) + power_threshold = power_max_capacity - (ambient_temp - temp_threshold) * slope + if power_threshold <= 0: + logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_threshold, self.get_name())) + power_threshold = 0 + return float(power_threshold) / 1000000 + + return None + + def get_psu_power_warning_suppress_threshold(self): + """ + Retrieve the warning suppress threshold of the power on this PSU + The value can be volatile, so the caller should call the API each time it is used. + On Mellanox platform, it is translated from the `warning threshold` + + Returns: + A float number, the warning suppress threshold of the PSU in watts. + """ + return self._get_psu_power_threshold(Psu.AMBIENT_TEMP_WARNING_THRESHOLD) + + def get_psu_power_critical_threshold(self): + """ + Retrieve the critical threshold of the power on this PSU + The value can be volatile, so the caller should call the API each time it is used. + + Returns: + A float number, the critical threshold of the PSU in watts. + """ + return self._get_psu_power_threshold(Psu.AMBIENT_TEMP_CRITICAL_THRESHOLD) + + class InvalidPsuVolWA: """This class is created as a workaround for a known hardware issue that the PSU voltage threshold could be a invalid value 127998. Once we read a voltage threshold value equal to 127998, we should do following: diff --git a/platform/mellanox/mlnx-platform-api/tests/test_psu.py b/platform/mellanox/mlnx-platform-api/tests/test_psu.py index d92b8f716a..7fff0d5482 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_psu.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_psu.py @@ -161,3 +161,58 @@ class TestPsu: vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == 9999 mock_run_command.assert_called_with(['sensors', '-s']) + + @mock.patch('os.path.exists', mock.MagicMock(return_value=True)) + @mock.patch('sonic_platform.utils.read_int_from_file') + def test_psu_power_threshold(self, mock_read_int_from_file): + Psu.all_psus_support_power_threshold = True + psu = Psu(0) + common_info = { + psu.psu_oper_status: 1, + psu.psu_power_max_capacity: 100000000, + psu.AMBIENT_TEMP_CRITICAL_THRESHOLD: 65000, + psu.AMBIENT_TEMP_WARNING_THRESHOLD: 55000, + psu.PSU_POWER_SLOPE: 2000 + } + normal_data = { + psu.PORT_AMBIENT_TEMP: 55000, + psu.FAN_AMBIENT_TEMP: 50000, + 'warning_threshold': 100.0, + 'critical_threshold': 100.0 + } + warning_data = { + psu.PORT_AMBIENT_TEMP: 65000, + psu.FAN_AMBIENT_TEMP: 60000, + 'warning_threshold': 90.0, + 'critical_threshold': 100.0 + } + critical_data = { + psu.PORT_AMBIENT_TEMP: 70000, + psu.FAN_AMBIENT_TEMP: 75000, + 'warning_threshold': 70.0, + 'critical_threshold': 90.0 + } + test_data = {} + def mock_side_effect(value): + if value in common_info: + return common_info[value] + else: + return test_data[value] + + mock_read_int_from_file.side_effect = mock_side_effect + test_data = normal_data + assert psu.get_psu_power_warning_suppress_threshold() == normal_data['warning_threshold'] + assert psu.get_psu_power_critical_threshold() == normal_data['critical_threshold'] + + test_data = warning_data + assert psu.get_psu_power_warning_suppress_threshold() == warning_data['warning_threshold'] + assert psu.get_psu_power_critical_threshold() == warning_data['critical_threshold'] + + test_data = critical_data + assert psu.get_psu_power_warning_suppress_threshold() == critical_data['warning_threshold'] + assert psu.get_psu_power_critical_threshold() == critical_data['critical_threshold'] + + def test_psu_not_support_power_threshold(self): + psu = Psu(0) + assert psu.get_psu_power_warning_suppress_threshold() is None + assert psu.get_psu_power_critical_threshold() is None