From 80bf061b3762d11d836d87847c9227977d2b99e3 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Thu, 26 Mar 2020 01:54:07 +0800 Subject: [PATCH] [Mellanox] Fix thermal control bugs (#4298) * [thermal control] Fix pmon docker stop issue on 3800 * [thermal fix] Fix QA test issue * [thermal fix] change psu._get_power_available_status to psu.get_power_available_status * [thermal fix] adjust log for PSU absence and power absence * [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies * [thermal] fix fan.get_presence for non-removable SKU * [thermal fix] fix issue: fan direction is based on drawer * Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A * [thermal fix] add unit test for get_direction for absent FAN * Unplugable PSU has no FAN, no need add a FAN object for this PSU * Update submodules Co-authored-by: Stephen Sun <5379172+stephenxs@users.noreply.github.com> --- .../sonic_platform/chassis.py | 4 +- .../mlnx-platform-api/sonic_platform/fan.py | 34 ++++++--- .../mlnx-platform-api/sonic_platform/psu.py | 30 ++++++-- .../sonic_platform/thermal.py | 30 +++++--- .../sonic_platform/thermal_infos.py | 4 +- .../tests/duplicate_action.json | 18 +++++ .../tests/duplicate_condition.json | 17 +++++ .../mlnx-platform-api/tests/empty_action.json | 10 +++ .../tests/empty_condition.json | 11 +++ .../mlnx-platform-api/tests/mock_platform.py | 4 + .../tests/policy_with_same_conditions.json | 75 +++++++++++++++++++ .../mlnx-platform-api/tests/test_fan_api.py | 17 +++++ .../tests/test_thermal_policy.py | 46 ++++++++++++ src/sonic-platform-common | 2 +- src/sonic-platform-daemons | 2 +- 15 files changed, 270 insertions(+), 34 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/tests/duplicate_action.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/empty_action.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/empty_condition.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_fan_api.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 4c5e04b216..995d5cc8ae 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -110,9 +110,9 @@ class Chassis(ChassisBase): for index in range(num_of_fan): if multi_rotor_in_drawer: - fan = Fan(has_fan_dir, index, index/2) + fan = Fan(has_fan_dir, index, index/2, False, self.sku_name) else: - fan = Fan(has_fan_dir, index, index) + fan = Fan(has_fan_dir, index, index, False, self.sku_name) self._fan_list.append(fan) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index cc4f8e81d9..9ce65d1e2f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -25,17 +25,22 @@ LED_PATH = "/var/run/hw-management/led/" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" +# SKUs with unplugable FANs: +# 1. don't have fanX_status and should be treated as always present +hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] + class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" - def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False): + def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 self.index = fan_index + 1 self.drawer_index = drawer_index + 1 self.is_psu_fan = psu_fan + self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True self.fan_min_speed_path = "fan{}_min".format(self.index) if not self.is_psu_fan: @@ -47,7 +52,7 @@ class Fan(FanBase): else: self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index) self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) - self._name = 'psu_{}_fan_{}'.format(self.index, fan_index) + self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) @@ -80,13 +85,13 @@ class Fan(FanBase): 1 stands for forward, in other words intake 0 stands for reverse, in other words exhaust """ - if not self.fan_dir or self.is_psu_fan: + if not self.fan_dir or self.is_psu_fan or not self.get_presence(): return self.FAN_DIRECTION_NOT_APPLICABLE try: with open(os.path.join(self.fan_dir), 'r') as fan_dir: fan_dir_bits = int(fan_dir.read()) - fan_mask = 1 << self.index - 1 + fan_mask = 1 << self.drawer_index - 1 if fan_dir_bits & fan_mask: return self.FAN_DIRECTION_INTAKE else: @@ -107,15 +112,15 @@ class Fan(FanBase): """ status = 0 if self.is_psu_fan: - status = 1 + status = 0 else: try: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: status = int(fault_status.read()) except (ValueError, IOError): - status = 0 + status = 1 - return status == 1 + return status == 0 def get_presence(self): @@ -132,11 +137,14 @@ class Fan(FanBase): else: status = 0 else: - try: - with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: - status = int(presence_status.read()) - except (ValueError, IOError): - status = 0 + if self.always_presence: + status = 1 + else: + try: + with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: + status = int(presence_status.read()) + except (ValueError, IOError): + status = 0 return status == 1 @@ -183,6 +191,8 @@ class Fan(FanBase): max_speed_in_rpm = self._get_max_speed_in_rpm() speed = 100*speed_in_rpm/max_speed_in_rpm + if speed > 100: + speed = 100 return speed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 5f9d529f94..1dfcf54baf 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -99,19 +99,21 @@ class Psu(PsuBase): psu_presence = os.path.join(self.psu_path, psu_presence) self.psu_presence = psu_presence - fan = Fan(sku, psu_index, psu_index, True) - if fan.get_presence(): + # unplugable PSU has no FAN + if sku not in hwsku_dict_with_unplugable_psu: + fan = Fan(sku, psu_index, psu_index, True) self._fan_list.append(fan) - def get_name(self): - return self._name - self.psu_green_led_path = "led_psu_green" self.psu_red_led_path = "led_psu_red" self.psu_orange_led_path = "led_psu_orange" self.psu_led_cap_path = "led_psu_capability" + def get_name(self): + return self._name + + def _read_generic_file(self, filename, len): """ Read a generic file, returns the contents of the file @@ -287,3 +289,21 @@ class Psu(PsuBase): raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e))) return self.STATUS_LED_COLOR_OFF + + + def get_power_available_status(self): + """ + Gets the power available status + + Returns: + True if power is present and power on. + False and "absence of PSU" if power is not present. + False and "absence of power" if power is present but not power on. + """ + if not self.get_presence(): + return False, "absence of PSU" + elif not self.get_powergood_status(): + return False, "absence of power" + else: + return True, "" + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 424caa4b3c..a5faa5ea79 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -64,7 +64,8 @@ thermal_api_handler_psu = { } thermal_api_handler_gearbox = { THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input", - THERMAL_API_GET_HIGH_THRESHOLD:None + THERMAL_API_GET_HIGH_THRESHOLD:None, + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_ambient_apis = { THERMAL_DEV_ASIC_AMBIENT : "asic", @@ -279,7 +280,7 @@ def initialize_thermals(sku, thermal_list, psu_list): else: if category == THERMAL_DEV_CATEGORY_PSU: for index in range(count): - thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off") + thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status) thermal_list.append(thermal) else: for index in range(count): @@ -289,7 +290,7 @@ def initialize_thermals(sku, thermal_list, psu_list): class Thermal(ThermalBase): - def __init__(self, category, index, has_index, dependency = None, hint = None): + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories """ @@ -308,7 +309,6 @@ class Thermal(ThermalBase): self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD) self.dependency = dependency - self.dependent_hint = hint def get_name(self): @@ -360,13 +360,11 @@ class Thermal(ThermalBase): A float number of current temperature in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - if self.dependency and not self.dependency(): - if self.dependent_hint: - hint = self.dependent_hint - else: - hint = "unknown reason" - logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint)) - return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.temperature, 0) if value_str is None: return None @@ -386,6 +384,11 @@ class Thermal(ThermalBase): """ if self.high_threshold is None: return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_threshold, 0) if value_str is None: return None @@ -405,6 +408,11 @@ class Thermal(ThermalBase): """ if self.high_critical_threshold is None: return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_critical_threshold, 0) if value_str is None: return None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 34d31e47d2..82c186495f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -77,12 +77,12 @@ class PsuInfo(ThermalPolicyInfoBase): """ self._status_changed = False for psu in chassis.get_all_psus(): - if psu.get_presence() and psu not in self._presence_psus: + if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus: self._presence_psus.add(psu) self._status_changed = True if psu in self._absence_psus: self._absence_psus.remove(psu) - elif not psu.get_presence() and psu not in self._absence_psus: + elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus: self._absence_psus.add(psu) self._status_changed = True if psu in self._presence_psus: diff --git a/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json b/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json new file mode 100644 index 0000000000..c19787aa26 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json @@ -0,0 +1,18 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} diff --git a/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json b/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json new file mode 100644 index 0000000000..c25d84762e --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json @@ -0,0 +1,17 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + }, + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} diff --git a/platform/mellanox/mlnx-platform-api/tests/empty_action.json b/platform/mellanox/mlnx-platform-api/tests/empty_action.json new file mode 100644 index 0000000000..b1051b5a6f --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/empty_action.json @@ -0,0 +1,10 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/empty_condition.json b/platform/mellanox/mlnx-platform-api/tests/empty_condition.json new file mode 100644 index 0000000000..e7a5884592 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/empty_condition.json @@ -0,0 +1,11 @@ +{ + "name": "any fan absence", + "conditions": [ + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index b8d070d449..f34ace9796 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -13,10 +13,14 @@ class MockFan: class MockPsu: def __init__(self): self.presence = True + self.powergood = True def get_presence(self): return self.presence + def get_powergood_status(self): + return self.powergood + class MockChassis: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json new file mode 100644 index 0000000000..ace291be1c --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json @@ -0,0 +1,75 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "false", + "fan_speed_when_suspend": "60" + }, + "info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence 1", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "true" + } + ] + } + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py b/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py new file mode 100644 index 0000000000..381260163c --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py @@ -0,0 +1,17 @@ +import os +import sys +from mock import MagicMock + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + +from sonic_platform.fan import Fan + + +def test_get_absence_fan_direction(): + fan = Fan(True, 0, 0) + fan.get_presence = MagicMock(return_value=False) + assert fan.fan_dir is not None + assert not fan.is_psu_fan + assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index ba9e502d4f..843244e937 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -66,6 +66,12 @@ def test_psu_info(): assert len(psu_info.get_presence_psus()) == 1 assert psu_info.is_status_changed() + psu_list[0].powergood = False + psu_info.collect(chassis) + assert len(psu_info.get_absence_psus()) == 1 + assert len(psu_info.get_presence_psus()) == 0 + assert psu_info.is_status_changed() + def test_fan_policy(thermal_manager): chassis = MockChassis() @@ -269,4 +275,44 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_duplicate_condition(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'duplicate_condition.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_duplicate_action(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'duplicate_action.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_empty_condition(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'empty_condition.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_empty_action(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'empty_action.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_policy_with_same_conditions(): + from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase + class MockThermalManager(ThermalManagerBase): + pass + + with pytest.raises(Exception): + MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) + diff --git a/src/sonic-platform-common b/src/sonic-platform-common index ed50e72d02..dc59b105ff 160000 --- a/src/sonic-platform-common +++ b/src/sonic-platform-common @@ -1 +1 @@ -Subproject commit ed50e72d028092399e2768e64a7a4ef01e7571de +Subproject commit dc59b105ff234bd89b9042c934b17c10b9b261f7 diff --git a/src/sonic-platform-daemons b/src/sonic-platform-daemons index fc455a7d01..97e40cefae 160000 --- a/src/sonic-platform-daemons +++ b/src/sonic-platform-daemons @@ -1 +1 @@ -Subproject commit fc455a7d01f8df1ed6a55960056facdf1b3b0b3c +Subproject commit 97e40cefae31928fe8faf0a0995c8263e7f62465