[Mellanox] Fix thermal control bugs (#4298)

* [thermal control] Fix pmon docker stop issue on 3800
* [thermal fix] Fix QA test issue
* [thermal fix] change psu._get_power_available_status to psu.get_power_available_status
* [thermal fix] adjust log for PSU absence and power absence
* [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies
* [thermal] fix fan.get_presence for non-removable SKU
* [thermal fix] fix issue: fan direction is based on drawer
* Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A
* [thermal fix] add unit test for get_direction for absent FAN
* Unplugable PSU has no FAN, no need add a FAN object for this PSU
* Update submodules

Co-authored-by: Stephen Sun <5379172+stephenxs@users.noreply.github.com>
This commit is contained in:
Junchao-Mellanox 2020-03-26 01:54:07 +08:00 committed by GitHub
parent d2bab4424b
commit 80bf061b37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 270 additions and 34 deletions

View File

@ -110,9 +110,9 @@ class Chassis(ChassisBase):
for index in range(num_of_fan): for index in range(num_of_fan):
if multi_rotor_in_drawer: if multi_rotor_in_drawer:
fan = Fan(has_fan_dir, index, index/2) fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
else: else:
fan = Fan(has_fan_dir, index, index) fan = Fan(has_fan_dir, index, index, False, self.sku_name)
self._fan_list.append(fan) self._fan_list.append(fan)

View File

@ -25,17 +25,22 @@ LED_PATH = "/var/run/hw-management/led/"
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir" FAN_DIR = "/var/run/hw-management/system/fan_dir"
# SKUs with unplugable FANs:
# 1. don't have fanX_status and should be treated as always present
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']
class Fan(FanBase): class Fan(FanBase):
"""Platform-specific Fan class""" """Platform-specific Fan class"""
STATUS_LED_COLOR_ORANGE = "orange" STATUS_LED_COLOR_ORANGE = "orange"
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False): def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
# API index is starting from 0, Mellanox platform index is starting from 1 # API index is starting from 0, Mellanox platform index is starting from 1
self.index = fan_index + 1 self.index = fan_index + 1
self.drawer_index = drawer_index + 1 self.drawer_index = drawer_index + 1
self.is_psu_fan = psu_fan self.is_psu_fan = psu_fan
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True
self.fan_min_speed_path = "fan{}_min".format(self.index) self.fan_min_speed_path = "fan{}_min".format(self.index)
if not self.is_psu_fan: if not self.is_psu_fan:
@ -47,7 +52,7 @@ class Fan(FanBase):
else: else:
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index) self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index) self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None self.fan_max_speed_path = None
self.fan_status_path = "fan{}_fault".format(self.index) self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
@ -80,13 +85,13 @@ class Fan(FanBase):
1 stands for forward, in other words intake 1 stands for forward, in other words intake
0 stands for reverse, in other words exhaust 0 stands for reverse, in other words exhaust
""" """
if not self.fan_dir or self.is_psu_fan: if not self.fan_dir or self.is_psu_fan or not self.get_presence():
return self.FAN_DIRECTION_NOT_APPLICABLE return self.FAN_DIRECTION_NOT_APPLICABLE
try: try:
with open(os.path.join(self.fan_dir), 'r') as fan_dir: with open(os.path.join(self.fan_dir), 'r') as fan_dir:
fan_dir_bits = int(fan_dir.read()) fan_dir_bits = int(fan_dir.read())
fan_mask = 1 << self.index - 1 fan_mask = 1 << self.drawer_index - 1
if fan_dir_bits & fan_mask: if fan_dir_bits & fan_mask:
return self.FAN_DIRECTION_INTAKE return self.FAN_DIRECTION_INTAKE
else: else:
@ -107,15 +112,15 @@ class Fan(FanBase):
""" """
status = 0 status = 0
if self.is_psu_fan: if self.is_psu_fan:
status = 1 status = 0
else: else:
try: try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read()) status = int(fault_status.read())
except (ValueError, IOError): except (ValueError, IOError):
status = 0 status = 1
return status == 1 return status == 0
def get_presence(self): def get_presence(self):
@ -131,6 +136,9 @@ class Fan(FanBase):
status = 1 status = 1
else: else:
status = 0 status = 0
else:
if self.always_presence:
status = 1
else: else:
try: try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
@ -183,6 +191,8 @@ class Fan(FanBase):
max_speed_in_rpm = self._get_max_speed_in_rpm() max_speed_in_rpm = self._get_max_speed_in_rpm()
speed = 100*speed_in_rpm/max_speed_in_rpm speed = 100*speed_in_rpm/max_speed_in_rpm
if speed > 100:
speed = 100
return speed return speed

View File

@ -99,19 +99,21 @@ class Psu(PsuBase):
psu_presence = os.path.join(self.psu_path, psu_presence) psu_presence = os.path.join(self.psu_path, psu_presence)
self.psu_presence = psu_presence self.psu_presence = psu_presence
# unplugable PSU has no FAN
if sku not in hwsku_dict_with_unplugable_psu:
fan = Fan(sku, psu_index, psu_index, True) fan = Fan(sku, psu_index, psu_index, True)
if fan.get_presence():
self._fan_list.append(fan) self._fan_list.append(fan)
def get_name(self):
return self._name
self.psu_green_led_path = "led_psu_green" self.psu_green_led_path = "led_psu_green"
self.psu_red_led_path = "led_psu_red" self.psu_red_led_path = "led_psu_red"
self.psu_orange_led_path = "led_psu_orange" self.psu_orange_led_path = "led_psu_orange"
self.psu_led_cap_path = "led_psu_capability" self.psu_led_cap_path = "led_psu_capability"
def get_name(self):
return self._name
def _read_generic_file(self, filename, len): def _read_generic_file(self, filename, len):
""" """
Read a generic file, returns the contents of the file Read a generic file, returns the contents of the file
@ -287,3 +289,21 @@ class Psu(PsuBase):
raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e))) raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e)))
return self.STATUS_LED_COLOR_OFF return self.STATUS_LED_COLOR_OFF
def get_power_available_status(self):
"""
Gets the power available status
Returns:
True if power is present and power on.
False and "absence of PSU" if power is not present.
False and "absence of power" if power is present but not power on.
"""
if not self.get_presence():
return False, "absence of PSU"
elif not self.get_powergood_status():
return False, "absence of power"
else:
return True, ""

View File

@ -64,7 +64,8 @@ thermal_api_handler_psu = {
} }
thermal_api_handler_gearbox = { thermal_api_handler_gearbox = {
THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input", THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input",
THERMAL_API_GET_HIGH_THRESHOLD:None THERMAL_API_GET_HIGH_THRESHOLD:None,
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
} }
thermal_ambient_apis = { thermal_ambient_apis = {
THERMAL_DEV_ASIC_AMBIENT : "asic", THERMAL_DEV_ASIC_AMBIENT : "asic",
@ -279,7 +280,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
else: else:
if category == THERMAL_DEV_CATEGORY_PSU: if category == THERMAL_DEV_CATEGORY_PSU:
for index in range(count): for index in range(count):
thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off") thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status)
thermal_list.append(thermal) thermal_list.append(thermal)
else: else:
for index in range(count): for index in range(count):
@ -289,7 +290,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
class Thermal(ThermalBase): class Thermal(ThermalBase):
def __init__(self, category, index, has_index, dependency = None, hint = None): def __init__(self, category, index, has_index, dependency = None):
""" """
index should be a string for category ambient and int for other categories index should be a string for category ambient and int for other categories
""" """
@ -308,7 +309,6 @@ class Thermal(ThermalBase):
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD) self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
self.dependency = dependency self.dependency = dependency
self.dependent_hint = hint
def get_name(self): def get_name(self):
@ -360,12 +360,10 @@ class Thermal(ThermalBase):
A float number of current temperature in Celsius up to nearest thousandth A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125 of one degree Celsius, e.g. 30.125
""" """
if self.dependency and not self.dependency(): if self.dependency:
if self.dependent_hint: status, hint = self.dependency()
hint = self.dependent_hint if not status:
else: logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint))
hint = "unknown reason"
logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint))
return None return None
value_str = self._read_generic_file(self.temperature, 0) value_str = self._read_generic_file(self.temperature, 0)
if value_str is None: if value_str is None:
@ -386,6 +384,11 @@ class Thermal(ThermalBase):
""" """
if self.high_threshold is None: if self.high_threshold is None:
return None return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_threshold, 0) value_str = self._read_generic_file(self.high_threshold, 0)
if value_str is None: if value_str is None:
return None return None
@ -405,6 +408,11 @@ class Thermal(ThermalBase):
""" """
if self.high_critical_threshold is None: if self.high_critical_threshold is None:
return None return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_critical_threshold, 0) value_str = self._read_generic_file(self.high_critical_threshold, 0)
if value_str is None: if value_str is None:
return None return None

View File

@ -77,12 +77,12 @@ class PsuInfo(ThermalPolicyInfoBase):
""" """
self._status_changed = False self._status_changed = False
for psu in chassis.get_all_psus(): for psu in chassis.get_all_psus():
if psu.get_presence() and psu not in self._presence_psus: if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus:
self._presence_psus.add(psu) self._presence_psus.add(psu)
self._status_changed = True self._status_changed = True
if psu in self._absence_psus: if psu in self._absence_psus:
self._absence_psus.remove(psu) self._absence_psus.remove(psu)
elif not psu.get_presence() and psu not in self._absence_psus: elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus:
self._absence_psus.add(psu) self._absence_psus.add(psu)
self._status_changed = True self._status_changed = True
if psu in self._presence_psus: if psu in self._presence_psus:

View File

@ -0,0 +1,18 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}

View File

@ -0,0 +1,17 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
},
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}

View File

@ -0,0 +1,10 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
]
}

View File

@ -0,0 +1,11 @@
{
"name": "any fan absence",
"conditions": [
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}

View File

@ -13,10 +13,14 @@ class MockFan:
class MockPsu: class MockPsu:
def __init__(self): def __init__(self):
self.presence = True self.presence = True
self.powergood = True
def get_presence(self): def get_presence(self):
return self.presence return self.presence
def get_powergood_status(self):
return self.powergood
class MockChassis: class MockChassis:
def __init__(self): def __init__(self):

View File

@ -0,0 +1,75 @@
{
"thermal_control_algorithm": {
"run_at_boot_up": "false",
"fan_speed_when_suspend": "60"
},
"info_types": [
{
"type": "fan_info"
},
{
"type": "psu_info"
},
{
"type": "chassis_info"
}
],
"policies": [
{
"name": "all fan and psu presence",
"conditions": [
{
"type": "fan.all.presence"
},
{
"type": "psu.all.presence"
}
],
"actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
},
{
"name": "any psu absence",
"conditions": [
{
"type": "psu.any.absence"
}
],
"actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
},
{
"name": "all fan and psu presence 1",
"conditions": [
{
"type": "fan.all.presence"
},
{
"type": "psu.all.presence"
}
],
"actions": [
{
"type": "thermal_control.control",
"status": "true"
}
]
}
]
}

View File

@ -0,0 +1,17 @@
import os
import sys
from mock import MagicMock
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
from sonic_platform.fan import Fan
def test_get_absence_fan_direction():
fan = Fan(True, 0, 0)
fan.get_presence = MagicMock(return_value=False)
assert fan.fan_dir is not None
assert not fan.is_psu_fan
assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE

View File

@ -66,6 +66,12 @@ def test_psu_info():
assert len(psu_info.get_presence_psus()) == 1 assert len(psu_info.get_presence_psus()) == 1
assert psu_info.is_status_changed() assert psu_info.is_status_changed()
psu_list[0].powergood = False
psu_info.collect(chassis)
assert len(psu_info.get_absence_psus()) == 1
assert len(psu_info.get_presence_psus()) == 0
assert psu_info.is_status_changed()
def test_fan_policy(thermal_manager): def test_fan_policy(thermal_manager):
chassis = MockChassis() chassis = MockChassis()
@ -269,4 +275,44 @@ def test_load_control_thermal_algo_action():
with pytest.raises(ValueError): with pytest.raises(ValueError):
action.load_from_json(json_obj) action.load_from_json(json_obj)
def test_load_duplicate_condition():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'duplicate_condition.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)
def test_load_duplicate_action():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'duplicate_action.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)
def test_load_empty_condition():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'empty_condition.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)
def test_load_empty_action():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'empty_action.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)
def test_load_policy_with_same_conditions():
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
class MockThermalManager(ThermalManagerBase):
pass
with pytest.raises(Exception):
MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json'))

@ -1 +1 @@
Subproject commit ed50e72d028092399e2768e64a7a4ef01e7571de Subproject commit dc59b105ff234bd89b9042c934b17c10b9b261f7

@ -1 +1 @@
Subproject commit fc455a7d01f8df1ed6a55960056facdf1b3b0b3c Subproject commit 97e40cefae31928fe8faf0a0995c8263e7f62465