From f407a10c274047a917fd8c1052c3bbfafad94d64 Mon Sep 17 00:00:00 2001 From: mssonicbld <79238446+mssonicbld@users.noreply.github.com> Date: Fri, 30 Jun 2023 00:25:50 +0800 Subject: [PATCH] [Mellanox] Adjust warning threshold implementation according to the latest algorithm update (#15092) (#15665) --- .../mlnx-platform-api/sonic_platform/psu.py | 82 ++++++++++++------- .../mlnx-platform-api/tests/test_psu.py | 10 +-- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 521183ca1f..cc5bb61b96 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2022 NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -512,54 +512,74 @@ class Psu(FixedPsu): return float(amperes) / 1000 return None - def _get_psu_power_threshold(self, temp_threshold_path): - """ - Calculate power threshold for a PSU according to the maximum power capacity and ambient temperature - amb_temp = min(port_amb, fan_amb) - If amb_temp < ambient_temp_threshold - threshold = max capacity - else - threshold = max capacity - slope*(amb_temp - ambient_temp_threshold) - """ - if self.get_powergood_status(): - if os.path.exists(self.psu_power_max_capacity): - power_max_capacity = utils.read_int_from_file(self.psu_power_max_capacity) - temp_threshold = utils.read_int_from_file(temp_threshold_path) - fan_ambient_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP) - port_ambient_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP) - ambient_temp = min(fan_ambient_temp, port_ambient_temp) - if ambient_temp < temp_threshold: - power_threshold = power_max_capacity - else: - slope = utils.read_int_from_file(self.psu_power_slope) - 
power_threshold = power_max_capacity - (ambient_temp - temp_threshold) * slope - if power_threshold <= 0: - logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_threshold, self.get_name())) - power_threshold = 0 - return float(power_threshold) / 1000000 - - return None - def get_psu_power_warning_suppress_threshold(self): """ Retrieve the warning suppress threshold of the power on this PSU The value can be volatile, so the caller should call the API each time it is used. On Mellanox platform, it is translated from the `warning threshold` + The formula to calculate power warning threshold for a PSU + amb_temp = min(port_amb, fan_amb) + If amb_temp < ambient_temp_warning_threshold + threshold = max capacity - slope + else + threshold = max capacity - slope * (1 + amb_temp - ambient_temp_warning_threshold) + Returns: A float number, the warning suppress threshold of the PSU in watts. """ - return self._get_psu_power_threshold(Psu.AMBIENT_TEMP_WARNING_THRESHOLD) + if self.get_powergood_status(): + if os.path.exists(self.psu_power_max_capacity): + power_max_capacity = utils.read_int_from_file(self.psu_power_max_capacity) + temp_warning_threshold = utils.read_int_from_file(Psu.AMBIENT_TEMP_WARNING_THRESHOLD) + fan_ambient_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP) + port_ambient_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP) + ambient_temp = min(fan_ambient_temp, port_ambient_temp) + slope = utils.read_int_from_file(self.psu_power_slope) * 1000 + if ambient_temp < temp_warning_threshold: + power_warning_threshold = power_max_capacity - slope * 1000 + else: + power_warning_threshold = power_max_capacity - (1000 + ambient_temp - temp_warning_threshold) * slope + if power_warning_threshold <= 0: + logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_warning_threshold, self.get_name())) + power_warning_threshold = 0 + return float(power_warning_threshold) / 1000000 + + return None def
get_psu_power_critical_threshold(self): """ Retrieve the critical threshold of the power on this PSU The value can be volatile, so the caller should call the API each time it is used. + The formula to calculate power critical threshold for a PSU + amb_temp = min(port_amb, fan_amb) + If amb_temp < ambient_temp_critical_threshold + threshold = max capacity + else + threshold = max capacity - slope*(amb_temp - ambient_temp_critical_threshold) + Returns: A float number, the critical threshold of the PSU in watts. """ - return self._get_psu_power_threshold(Psu.AMBIENT_TEMP_CRITICAL_THRESHOLD) + if self.get_powergood_status(): + if os.path.exists(self.psu_power_max_capacity): + power_max_capacity = utils.read_int_from_file(self.psu_power_max_capacity) + temp_critical_threshold = utils.read_int_from_file(Psu.AMBIENT_TEMP_CRITICAL_THRESHOLD) + fan_ambient_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP) + port_ambient_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP) + ambient_temp = min(fan_ambient_temp, port_ambient_temp) + if ambient_temp < temp_critical_threshold: + power_critical_threshold = power_max_capacity + else: + slope = utils.read_int_from_file(self.psu_power_slope) * 1000 + power_critical_threshold = power_max_capacity - (ambient_temp - temp_critical_threshold) * slope + if power_critical_threshold <= 0: + logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_critical_threshold, self.get_name())) + power_critical_threshold = 0 + return float(power_critical_threshold) / 1000000 + + return None class InvalidPsuVolWA: diff --git a/platform/mellanox/mlnx-platform-api/tests/test_psu.py b/platform/mellanox/mlnx-platform-api/tests/test_psu.py index 00c3bc8623..2882776b62 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_psu.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_psu.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. 
# Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -172,24 +172,24 @@ class TestPsu: psu.psu_power_max_capacity: 100000000, psu.AMBIENT_TEMP_CRITICAL_THRESHOLD: 65000, psu.AMBIENT_TEMP_WARNING_THRESHOLD: 55000, - psu.psu_power_slope: 2000 + psu.psu_power_slope: 2 } normal_data = { psu.PORT_AMBIENT_TEMP: 55000, psu.FAN_AMBIENT_TEMP: 50000, - 'warning_threshold': 100.0, + 'warning_threshold': 98.0, 'critical_threshold': 100.0 } warning_data = { psu.PORT_AMBIENT_TEMP: 65000, psu.FAN_AMBIENT_TEMP: 60000, - 'warning_threshold': 90.0, + 'warning_threshold': 88.0, 'critical_threshold': 100.0 } critical_data = { psu.PORT_AMBIENT_TEMP: 70000, psu.FAN_AMBIENT_TEMP: 75000, - 'warning_threshold': 70.0, + 'warning_threshold': 68.0, 'critical_threshold': 90.0 } test_data = {}