[Mellanox] Fix thermal control bugs (#4298)
* [thermal control] Fix pmon docker stop issue on 3800 * [thermal fix] Fix QA test issue * [thermal fix] change psu._get_power_available_status to psu.get_power_available_status * [thermal fix] adjust log for PSU absence and power absence * [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies * [thermal] fix fan.get_presence for non-removable SKU * [thermal fix] fix issue: fan direction is based on drawer * Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A * [thermal fix] add unit test for get_direction for absent FAN * Unplugable PSU has no FAN, no need add a FAN object for this PSU * Update submodules Co-authored-by: Stephen Sun <5379172+stephenxs@users.noreply.github.com>
This commit is contained in:
parent
d2bab4424b
commit
80bf061b37
@ -110,9 +110,9 @@ class Chassis(ChassisBase):
|
||||
|
||||
for index in range(num_of_fan):
|
||||
if multi_rotor_in_drawer:
|
||||
fan = Fan(has_fan_dir, index, index/2)
|
||||
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
|
||||
else:
|
||||
fan = Fan(has_fan_dir, index, index)
|
||||
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
|
||||
self._fan_list.append(fan)
|
||||
|
||||
|
||||
|
@ -25,17 +25,22 @@ LED_PATH = "/var/run/hw-management/led/"
|
||||
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
|
||||
FAN_DIR = "/var/run/hw-management/system/fan_dir"
|
||||
|
||||
# SKUs with unplugable FANs:
|
||||
# 1. don't have fanX_status and should be treated as always present
|
||||
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']
|
||||
|
||||
class Fan(FanBase):
|
||||
"""Platform-specific Fan class"""
|
||||
|
||||
STATUS_LED_COLOR_ORANGE = "orange"
|
||||
|
||||
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False):
|
||||
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
|
||||
# API index is starting from 0, Mellanox platform index is starting from 1
|
||||
self.index = fan_index + 1
|
||||
self.drawer_index = drawer_index + 1
|
||||
|
||||
self.is_psu_fan = psu_fan
|
||||
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True
|
||||
|
||||
self.fan_min_speed_path = "fan{}_min".format(self.index)
|
||||
if not self.is_psu_fan:
|
||||
@ -47,7 +52,7 @@ class Fan(FanBase):
|
||||
else:
|
||||
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
|
||||
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
|
||||
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index)
|
||||
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
|
||||
self.fan_max_speed_path = None
|
||||
self.fan_status_path = "fan{}_fault".format(self.index)
|
||||
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
|
||||
@ -80,13 +85,13 @@ class Fan(FanBase):
|
||||
1 stands for forward, in other words intake
|
||||
0 stands for reverse, in other words exhaust
|
||||
"""
|
||||
if not self.fan_dir or self.is_psu_fan:
|
||||
if not self.fan_dir or self.is_psu_fan or not self.get_presence():
|
||||
return self.FAN_DIRECTION_NOT_APPLICABLE
|
||||
|
||||
try:
|
||||
with open(os.path.join(self.fan_dir), 'r') as fan_dir:
|
||||
fan_dir_bits = int(fan_dir.read())
|
||||
fan_mask = 1 << self.index - 1
|
||||
fan_mask = 1 << self.drawer_index - 1
|
||||
if fan_dir_bits & fan_mask:
|
||||
return self.FAN_DIRECTION_INTAKE
|
||||
else:
|
||||
@ -107,15 +112,15 @@ class Fan(FanBase):
|
||||
"""
|
||||
status = 0
|
||||
if self.is_psu_fan:
|
||||
status = 1
|
||||
status = 0
|
||||
else:
|
||||
try:
|
||||
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
|
||||
status = int(fault_status.read())
|
||||
except (ValueError, IOError):
|
||||
status = 0
|
||||
status = 1
|
||||
|
||||
return status == 1
|
||||
return status == 0
|
||||
|
||||
|
||||
def get_presence(self):
|
||||
@ -131,6 +136,9 @@ class Fan(FanBase):
|
||||
status = 1
|
||||
else:
|
||||
status = 0
|
||||
else:
|
||||
if self.always_presence:
|
||||
status = 1
|
||||
else:
|
||||
try:
|
||||
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
|
||||
@ -183,6 +191,8 @@ class Fan(FanBase):
|
||||
|
||||
max_speed_in_rpm = self._get_max_speed_in_rpm()
|
||||
speed = 100*speed_in_rpm/max_speed_in_rpm
|
||||
if speed > 100:
|
||||
speed = 100
|
||||
|
||||
return speed
|
||||
|
||||
|
@ -99,19 +99,21 @@ class Psu(PsuBase):
|
||||
psu_presence = os.path.join(self.psu_path, psu_presence)
|
||||
self.psu_presence = psu_presence
|
||||
|
||||
# unplugable PSU has no FAN
|
||||
if sku not in hwsku_dict_with_unplugable_psu:
|
||||
fan = Fan(sku, psu_index, psu_index, True)
|
||||
if fan.get_presence():
|
||||
self._fan_list.append(fan)
|
||||
|
||||
def get_name(self):
|
||||
return self._name
|
||||
|
||||
self.psu_green_led_path = "led_psu_green"
|
||||
self.psu_red_led_path = "led_psu_red"
|
||||
self.psu_orange_led_path = "led_psu_orange"
|
||||
self.psu_led_cap_path = "led_psu_capability"
|
||||
|
||||
|
||||
def get_name(self):
|
||||
return self._name
|
||||
|
||||
|
||||
def _read_generic_file(self, filename, len):
|
||||
"""
|
||||
Read a generic file, returns the contents of the file
|
||||
@ -287,3 +289,21 @@ class Psu(PsuBase):
|
||||
raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e)))
|
||||
|
||||
return self.STATUS_LED_COLOR_OFF
|
||||
|
||||
|
||||
def get_power_available_status(self):
|
||||
"""
|
||||
Gets the power available status
|
||||
|
||||
Returns:
|
||||
True if power is present and power on.
|
||||
False and "absence of PSU" if power is not present.
|
||||
False and "absence of power" if power is present but not power on.
|
||||
"""
|
||||
if not self.get_presence():
|
||||
return False, "absence of PSU"
|
||||
elif not self.get_powergood_status():
|
||||
return False, "absence of power"
|
||||
else:
|
||||
return True, ""
|
||||
|
||||
|
@ -64,7 +64,8 @@ thermal_api_handler_psu = {
|
||||
}
|
||||
thermal_api_handler_gearbox = {
|
||||
THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input",
|
||||
THERMAL_API_GET_HIGH_THRESHOLD:None
|
||||
THERMAL_API_GET_HIGH_THRESHOLD:None,
|
||||
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
|
||||
}
|
||||
thermal_ambient_apis = {
|
||||
THERMAL_DEV_ASIC_AMBIENT : "asic",
|
||||
@ -279,7 +280,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
|
||||
else:
|
||||
if category == THERMAL_DEV_CATEGORY_PSU:
|
||||
for index in range(count):
|
||||
thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off")
|
||||
thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status)
|
||||
thermal_list.append(thermal)
|
||||
else:
|
||||
for index in range(count):
|
||||
@ -289,7 +290,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
|
||||
|
||||
|
||||
class Thermal(ThermalBase):
|
||||
def __init__(self, category, index, has_index, dependency = None, hint = None):
|
||||
def __init__(self, category, index, has_index, dependency = None):
|
||||
"""
|
||||
index should be a string for category ambient and int for other categories
|
||||
"""
|
||||
@ -308,7 +309,6 @@ class Thermal(ThermalBase):
|
||||
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
|
||||
self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
|
||||
self.dependency = dependency
|
||||
self.dependent_hint = hint
|
||||
|
||||
|
||||
def get_name(self):
|
||||
@ -360,12 +360,10 @@ class Thermal(ThermalBase):
|
||||
A float number of current temperature in Celsius up to nearest thousandth
|
||||
of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
if self.dependency and not self.dependency():
|
||||
if self.dependent_hint:
|
||||
hint = self.dependent_hint
|
||||
else:
|
||||
hint = "unknown reason"
|
||||
logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint))
|
||||
if self.dependency:
|
||||
status, hint = self.dependency()
|
||||
if not status:
|
||||
logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint))
|
||||
return None
|
||||
value_str = self._read_generic_file(self.temperature, 0)
|
||||
if value_str is None:
|
||||
@ -386,6 +384,11 @@ class Thermal(ThermalBase):
|
||||
"""
|
||||
if self.high_threshold is None:
|
||||
return None
|
||||
if self.dependency:
|
||||
status, hint = self.dependency()
|
||||
if not status:
|
||||
logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint))
|
||||
return None
|
||||
value_str = self._read_generic_file(self.high_threshold, 0)
|
||||
if value_str is None:
|
||||
return None
|
||||
@ -405,6 +408,11 @@ class Thermal(ThermalBase):
|
||||
"""
|
||||
if self.high_critical_threshold is None:
|
||||
return None
|
||||
if self.dependency:
|
||||
status, hint = self.dependency()
|
||||
if not status:
|
||||
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
|
||||
return None
|
||||
value_str = self._read_generic_file(self.high_critical_threshold, 0)
|
||||
if value_str is None:
|
||||
return None
|
||||
|
@ -77,12 +77,12 @@ class PsuInfo(ThermalPolicyInfoBase):
|
||||
"""
|
||||
self._status_changed = False
|
||||
for psu in chassis.get_all_psus():
|
||||
if psu.get_presence() and psu not in self._presence_psus:
|
||||
if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus:
|
||||
self._presence_psus.add(psu)
|
||||
self._status_changed = True
|
||||
if psu in self._absence_psus:
|
||||
self._absence_psus.remove(psu)
|
||||
elif not psu.get_presence() and psu not in self._absence_psus:
|
||||
elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus:
|
||||
self._absence_psus.add(psu)
|
||||
self._status_changed = True
|
||||
if psu in self._presence_psus:
|
||||
|
@ -0,0 +1,18 @@
|
||||
{
|
||||
"name": "any fan absence",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "fan.any.absence"
|
||||
}
|
||||
],
|
||||
"actions": [
|
||||
{
|
||||
"type": "fan.all.set_speed",
|
||||
"speed": "100"
|
||||
},
|
||||
{
|
||||
"type": "fan.all.set_speed",
|
||||
"speed": "100"
|
||||
}
|
||||
]
|
||||
}
|
@ -0,0 +1,17 @@
|
||||
{
|
||||
"name": "any fan absence",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "fan.any.absence"
|
||||
},
|
||||
{
|
||||
"type": "fan.any.absence"
|
||||
}
|
||||
],
|
||||
"actions": [
|
||||
{
|
||||
"type": "fan.all.set_speed",
|
||||
"speed": "100"
|
||||
}
|
||||
]
|
||||
}
|
10
platform/mellanox/mlnx-platform-api/tests/empty_action.json
Normal file
10
platform/mellanox/mlnx-platform-api/tests/empty_action.json
Normal file
@ -0,0 +1,10 @@
|
||||
{
|
||||
"name": "any fan absence",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "fan.any.absence"
|
||||
}
|
||||
],
|
||||
"actions": [
|
||||
]
|
||||
}
|
@ -0,0 +1,11 @@
|
||||
{
|
||||
"name": "any fan absence",
|
||||
"conditions": [
|
||||
],
|
||||
"actions": [
|
||||
{
|
||||
"type": "fan.all.set_speed",
|
||||
"speed": "100"
|
||||
}
|
||||
]
|
||||
}
|
@ -13,10 +13,14 @@ class MockFan:
|
||||
class MockPsu:
|
||||
def __init__(self):
|
||||
self.presence = True
|
||||
self.powergood = True
|
||||
|
||||
def get_presence(self):
|
||||
return self.presence
|
||||
|
||||
def get_powergood_status(self):
|
||||
return self.powergood
|
||||
|
||||
|
||||
class MockChassis:
|
||||
def __init__(self):
|
||||
|
@ -0,0 +1,75 @@
|
||||
{
|
||||
"thermal_control_algorithm": {
|
||||
"run_at_boot_up": "false",
|
||||
"fan_speed_when_suspend": "60"
|
||||
},
|
||||
"info_types": [
|
||||
{
|
||||
"type": "fan_info"
|
||||
},
|
||||
{
|
||||
"type": "psu_info"
|
||||
},
|
||||
{
|
||||
"type": "chassis_info"
|
||||
}
|
||||
],
|
||||
"policies": [
|
||||
{
|
||||
"name": "all fan and psu presence",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "fan.all.presence"
|
||||
},
|
||||
{
|
||||
"type": "psu.all.presence"
|
||||
}
|
||||
],
|
||||
"actions": [
|
||||
{
|
||||
"type": "thermal_control.control",
|
||||
"status": "false"
|
||||
},
|
||||
{
|
||||
"type": "fan.all.set_speed",
|
||||
"speed": "100"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "any psu absence",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "psu.any.absence"
|
||||
}
|
||||
],
|
||||
"actions": [
|
||||
{
|
||||
"type": "thermal_control.control",
|
||||
"status": "false"
|
||||
},
|
||||
{
|
||||
"type": "fan.all.set_speed",
|
||||
"speed": "100"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "all fan and psu presence 1",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "fan.all.presence"
|
||||
},
|
||||
{
|
||||
"type": "psu.all.presence"
|
||||
}
|
||||
],
|
||||
"actions": [
|
||||
{
|
||||
"type": "thermal_control.control",
|
||||
"status": "true"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
17
platform/mellanox/mlnx-platform-api/tests/test_fan_api.py
Normal file
17
platform/mellanox/mlnx-platform-api/tests/test_fan_api.py
Normal file
@ -0,0 +1,17 @@
|
||||
import os
|
||||
import sys
|
||||
from mock import MagicMock
|
||||
|
||||
test_path = os.path.dirname(os.path.abspath(__file__))
|
||||
modules_path = os.path.dirname(test_path)
|
||||
sys.path.insert(0, modules_path)
|
||||
|
||||
from sonic_platform.fan import Fan
|
||||
|
||||
|
||||
def test_get_absence_fan_direction():
|
||||
fan = Fan(True, 0, 0)
|
||||
fan.get_presence = MagicMock(return_value=False)
|
||||
assert fan.fan_dir is not None
|
||||
assert not fan.is_psu_fan
|
||||
assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE
|
@ -66,6 +66,12 @@ def test_psu_info():
|
||||
assert len(psu_info.get_presence_psus()) == 1
|
||||
assert psu_info.is_status_changed()
|
||||
|
||||
psu_list[0].powergood = False
|
||||
psu_info.collect(chassis)
|
||||
assert len(psu_info.get_absence_psus()) == 1
|
||||
assert len(psu_info.get_presence_psus()) == 0
|
||||
assert psu_info.is_status_changed()
|
||||
|
||||
|
||||
def test_fan_policy(thermal_manager):
|
||||
chassis = MockChassis()
|
||||
@ -269,4 +275,44 @@ def test_load_control_thermal_algo_action():
|
||||
with pytest.raises(ValueError):
|
||||
action.load_from_json(json_obj)
|
||||
|
||||
def test_load_duplicate_condition():
|
||||
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
||||
with open(os.path.join(test_path, 'duplicate_condition.json')) as f:
|
||||
json_obj = json.load(f)
|
||||
policy = ThermalPolicy()
|
||||
with pytest.raises(Exception):
|
||||
policy.load_from_json(json_obj)
|
||||
|
||||
def test_load_duplicate_action():
|
||||
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
||||
with open(os.path.join(test_path, 'duplicate_action.json')) as f:
|
||||
json_obj = json.load(f)
|
||||
policy = ThermalPolicy()
|
||||
with pytest.raises(Exception):
|
||||
policy.load_from_json(json_obj)
|
||||
|
||||
def test_load_empty_condition():
|
||||
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
||||
with open(os.path.join(test_path, 'empty_condition.json')) as f:
|
||||
json_obj = json.load(f)
|
||||
policy = ThermalPolicy()
|
||||
with pytest.raises(Exception):
|
||||
policy.load_from_json(json_obj)
|
||||
|
||||
def test_load_empty_action():
|
||||
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
||||
with open(os.path.join(test_path, 'empty_action.json')) as f:
|
||||
json_obj = json.load(f)
|
||||
policy = ThermalPolicy()
|
||||
with pytest.raises(Exception):
|
||||
policy.load_from_json(json_obj)
|
||||
|
||||
def test_load_policy_with_same_conditions():
|
||||
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
|
||||
class MockThermalManager(ThermalManagerBase):
|
||||
pass
|
||||
|
||||
with pytest.raises(Exception):
|
||||
MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json'))
|
||||
|
||||
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit ed50e72d028092399e2768e64a7a4ef01e7571de
|
||||
Subproject commit dc59b105ff234bd89b9042c934b17c10b9b261f7
|
@ -1 +1 @@
|
||||
Subproject commit fc455a7d01f8df1ed6a55960056facdf1b3b0b3c
|
||||
Subproject commit 97e40cefae31928fe8faf0a0995c8263e7f62465
|
Loading…
Reference in New Issue
Block a user