[Mellanox] Adjust warning threshold implementation according to the latest algorithm update (#15092)

- Why I did it
Adjust the warning threshold implementation according to the latest algorithm update

- How I did it
Modify the methods that compute the PSU power warning and critical thresholds

- How to verify it
Unit test updated to cover the change

Signed-off-by: Stephen Sun <stephens@nvidia.com>
This commit is contained in:
Stephen Sun 2023-06-13 20:14:10 +08:00 committed by GitHub
parent 3cb13226be
commit 238e6ffcc1
No account linked to committer's email address
2 changed files with 56 additions and 36 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022 NVIDIA CORPORATION & AFFILIATES.
# Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -512,54 +512,74 @@ class Psu(FixedPsu):
return float(amperes) / 1000
return None
def _get_psu_power_threshold(self, temp_threshold_path):
    """
    Derive a PSU power threshold from the maximum power capacity and the ambient temperature.

    Formula:
        amb_temp = min(port_amb, fan_amb)
        If amb_temp < ambient_temp_threshold
            threshold = max capacity
        else
            threshold = max capacity - slope*(amb_temp - ambient_temp_threshold)

    Args:
        temp_threshold_path: path of the file that holds the ambient temperature threshold.

    Returns:
        The computed threshold divided by 1000000, as a float, or None when the
        PSU power is not good or the max capacity file does not exist.
    """
    # Nothing can be computed unless the PSU is powered and exposes its capacity.
    if not self.get_powergood_status():
        return None
    if not os.path.exists(self.psu_power_max_capacity):
        return None

    capacity = utils.read_int_from_file(self.psu_power_max_capacity)
    threshold_temp = utils.read_int_from_file(temp_threshold_path)
    fan_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP)
    port_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP)
    coolest_temp = min(fan_temp, port_temp)

    result = capacity
    if coolest_temp >= threshold_temp:
        # The slope file is only read when derating actually applies.
        # NOTE(review): the slope is used unscaled here, while the sibling threshold
        # methods multiply it by 1000 -- confirm the unit of the slope file.
        derate_slope = utils.read_int_from_file(self.psu_power_slope)
        result = capacity - (coolest_temp - threshold_temp) * derate_slope

    if result <= 0:
        logger.log_warning('Got negative PSU power threshold {} for {}'.format(result, self.get_name()))
        result = 0

    return float(result) / 1000000
def get_psu_power_warning_suppress_threshold(self):
    """
    Retrieve the warning suppress threshold of the power on this PSU
    The value can be volatile, so the caller should call the API each time it is used.
    On Mellanox platform, it is translated from the `warning threshold`

    The formula to calculate power warning threshold for a PSU
        amb_temp = min(port_amb, fan_amb)
        If amb_temp < ambient_temp_warning_threshold
            threshold = max capacity - slope
        else
            threshold = max capacity - slope * (1 + amb_temp - ambient_temp_threshold)

    In the code below the temperatures are raw file values in which 1000 units
    stand for the `1` of the formula, and the raw result is divided by 1000000
    to obtain watts.

    Returns:
        A float number, the warning suppress threshold of the PSU in watts.
        None if the PSU power is not good or the max capacity file does not exist.
    """
    # The threshold is only defined for a powered PSU that exposes its max capacity.
    if not (self.get_powergood_status() and os.path.exists(self.psu_power_max_capacity)):
        return None

    power_max_capacity = utils.read_int_from_file(self.psu_power_max_capacity)
    temp_warning_threshold = utils.read_int_from_file(Psu.AMBIENT_TEMP_WARNING_THRESHOLD)
    fan_ambient_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP)
    port_ambient_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP)
    ambient_temp = min(fan_ambient_temp, port_ambient_temp)
    slope = utils.read_int_from_file(self.psu_power_slope) * 1000

    if ambient_temp < temp_warning_threshold:
        # Below the warning temperature the threshold is one slope step under the capacity.
        power_warning_threshold = power_max_capacity - slope * 1000
    else:
        # At or above it, each temperature unit over the threshold lowers it further.
        power_warning_threshold = power_max_capacity - (1000 + ambient_temp - temp_warning_threshold) * slope

    if power_warning_threshold <= 0:
        logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_warning_threshold, self.get_name()))
        power_warning_threshold = 0

    return float(power_warning_threshold) / 1000000
def get_psu_power_critical_threshold(self):
    """
    Retrieve the critical threshold of the power on this PSU
    The value can be volatile, so the caller should call the API each time it is used.

    The formula to calculate power critical threshold for a PSU
        amb_temp = min(port_amb, fan_amb)
        If amb_temp < ambient_temp_critical_threshold
            threshold = max capacity
        else
            threshold = max capacity - slope*(amb_temp - ambient_temp_critical_threshold)

    The raw result is divided by 1000000 to obtain watts.

    Returns:
        A float number, the critical threshold of the PSU in watts.
        None if the PSU power is not good or the max capacity file does not exist.
    """
    # The threshold is only defined for a powered PSU that exposes its max capacity.
    if not (self.get_powergood_status() and os.path.exists(self.psu_power_max_capacity)):
        return None

    power_max_capacity = utils.read_int_from_file(self.psu_power_max_capacity)
    temp_critical_threshold = utils.read_int_from_file(Psu.AMBIENT_TEMP_CRITICAL_THRESHOLD)
    fan_ambient_temp = utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP)
    port_ambient_temp = utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP)
    ambient_temp = min(fan_ambient_temp, port_ambient_temp)

    if ambient_temp < temp_critical_threshold:
        # Below the critical temperature the full capacity is available.
        power_critical_threshold = power_max_capacity
    else:
        # The slope file is only read when derating applies; scaled by 1000 as
        # in the warning threshold computation.
        slope = utils.read_int_from_file(self.psu_power_slope) * 1000
        power_critical_threshold = power_max_capacity - (ambient_temp - temp_critical_threshold) * slope

    if power_critical_threshold <= 0:
        logger.log_warning('Got negative PSU power threshold {} for {}'.format(power_critical_threshold, self.get_name()))
        power_critical_threshold = 0

    return float(power_critical_threshold) / 1000000
class InvalidPsuVolWA:

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES.
# Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -172,24 +172,24 @@ class TestPsu:
psu.psu_power_max_capacity: 100000000,
psu.AMBIENT_TEMP_CRITICAL_THRESHOLD: 65000,
psu.AMBIENT_TEMP_WARNING_THRESHOLD: 55000,
psu.psu_power_slope: 2000
psu.psu_power_slope: 2
}
normal_data = {
psu.PORT_AMBIENT_TEMP: 55000,
psu.FAN_AMBIENT_TEMP: 50000,
'warning_threshold': 100.0,
'warning_threshold': 98.0,
'critical_threshold': 100.0
}
warning_data = {
psu.PORT_AMBIENT_TEMP: 65000,
psu.FAN_AMBIENT_TEMP: 60000,
'warning_threshold': 90.0,
'warning_threshold': 88.0,
'critical_threshold': 100.0
}
critical_data = {
psu.PORT_AMBIENT_TEMP: 70000,
psu.FAN_AMBIENT_TEMP: 75000,
'warning_threshold': 70.0,
'warning_threshold': 68.0,
'critical_threshold': 90.0
}
test_data = {}