diff --git a/device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json new file mode 100644 index 0000000000..1e23d6c8b2 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -0,0 +1,80 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "true", + "fan_speed_when_suspend": "60" + }, + "info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "any fan absence", + "conditions": [ + { + "type": 
"fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + }, + { + "type": "fan.all.good" + } + ], + "actions": [ + { + "type": "thermal.recover" + } + ] + } + ] +} \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ 
b/device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json new file mode 120000 index 0000000000..5a25cd87f7 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/dockers/docker-platform-monitor/Dockerfile.j2 
b/dockers/docker-platform-monitor/Dockerfile.j2 index 61374af7d5..2ce609fff1 100755 --- a/dockers/docker-platform-monitor/Dockerfile.j2 +++ b/dockers/docker-platform-monitor/Dockerfile.j2 @@ -18,7 +18,8 @@ RUN apt-get update && \ rrdtool \ python-smbus \ ethtool \ - dmidecode && \ + dmidecode \ + i2c-tools && \ pip install enum34 {% if docker_platform_monitor_debs.strip() -%} diff --git a/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 b/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 index 5d848776f2..d33b4e7c3f 100644 --- a/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 +++ b/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 @@ -91,3 +91,14 @@ stdout_logfile=syslog stderr_logfile=syslog startsecs=10 {% endif %} + +{% if not skip_thermalctld %} +[program:thermalctld] +command=/usr/bin/thermalctld +priority=9 +autostart=false +autorestart=true +stdout_logfile=syslog +stderr_logfile=syslog +startsecs=0 +{% endif %} diff --git a/dockers/docker-platform-monitor/start.sh.j2 b/dockers/docker-platform-monitor/start.sh.j2 index 5b4fe45888..03e0b49b8c 100644 --- a/dockers/docker-platform-monitor/start.sh.j2 +++ b/dockers/docker-platform-monitor/start.sh.j2 @@ -75,3 +75,7 @@ supervisorctl start psud supervisorctl start syseepromd {% endif %} +{% if not skip_thermalctld %} +supervisorctl start thermalctld +{% endif %} + diff --git a/platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch b/platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch deleted file mode 100644 index a72c94473e..0000000000 --- a/platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch +++ /dev/null @@ -1,53 +0,0 @@ -From ebb17bd1f6996f73cb67313846a63c789e74c4f4 Mon Sep 17 00:00:00 2001 -From: Mykola Faryma -Date: Fri, 21 Feb 2020 12:28:54 +0200 -Subject: [PATCH 1/1] Make hw-mgmt SimX compatiable - -Signed-off-by: Mykola Faryma ---- - usr/usr/bin/hw-management.sh | 29 
+++++++++++++++++++++++++++++ - 1 file changed, 29 insertions(+) - -diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh -index 1b5b18a..3dfd4b1 100755 ---- a/usr/usr/bin/hw-management.sh -+++ b/usr/usr/bin/hw-management.sh -@@ -943,6 +943,35 @@ do_chip_down() - /usr/bin/hw-management-thermal-events.sh change hotplug_asic down %S %p - } - -+handle_simx() -+{ -+ local -r onie_platform="$(cat /host/machine.conf | grep onie_platform | cut -d= -f2)" -+ -+ local -r syseeprom_cache_path="/var/cache/sonic/decode-syseeprom/syseeprom_cache" -+ local -r syseeprom_hex_path="/usr/share/sonic/device/${onie_platform}/syseeprom.hex" -+ local -r syseeprom_vpd_path="/var/run/hw-management/eeprom/vpd_info" -+ -+ case $ACTION in -+ start) -+ /bin/bash -c "/bin/rm -f ${syseeprom_cache_path}" -+ /bin/bash -c "/bin/mkdir -p ${eeprom_path}" -+ /bin/bash -c "/usr/bin/xxd -r -p ${syseeprom_hex_path} ${syseeprom_vpd_path}" -+ ;; -+ stop) -+ /bin/bash -c "/bin/rm -fr ${hw_management_path}" -+ ;; -+ *) -+ echo "Usage: `basename $0` {start|stop}" -+ exit 1 -+ ;; -+ esac -+} -+ -+if [[ "$(cat /sys/devices/virtual/dmi/id/sys_vendor)" = "QEMU" ]]; then -+ handle_simx -+ exit 0 -+fi -+ - case $ACTION in - start) - if [ -d /var/run/hw-management ]; then --- -1.9.1 - diff --git a/platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch b/platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch new file mode 100644 index 0000000000..2bdadebcd0 --- /dev/null +++ b/platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch @@ -0,0 +1,27 @@ +From 3512488c981eb81d51ce92cb3573721e36861f56 Mon Sep 17 00:00:00 2001 +From: Junchao Chen +Date: Fri, 29 May 2020 10:38:53 +0300 +Subject: [PATCH] Disable hw-management thermal control service + +--- + usr/usr/bin/hw-management.sh | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/usr/usr/bin/hw-management.sh 
b/usr/usr/bin/hw-management.sh +index 65e5d39..0d1c4a1 100755 +--- a/usr/usr/bin/hw-management.sh ++++ b/usr/usr/bin/hw-management.sh +@@ -832,7 +832,9 @@ do_start() + if [ -f $config_path/max_tachos ]; then + max_tachos=$(<$config_path/max_tachos) + fi +- $THERMAL_CONTROL $thermal_type $max_tachos $max_psus& ++ # Disable hw-management thermal control because ++ # SONiC already implement it ++ #$THERMAL_CONTROL $thermal_type $max_tachos $max_psus& + } + + do_stop() +-- +1.9.1 + diff --git a/platform/mellanox/mlnx-platform-api.mk b/platform/mellanox/mlnx-platform-api.mk index 4b70e59deb..7bbbc3c70b 100644 --- a/platform/mellanox/mlnx-platform-api.mk +++ b/platform/mellanox/mlnx-platform-api.mk @@ -3,6 +3,7 @@ SONIC_PLATFORM_API_PY2 = mlnx_platform_api-1.0-py2-none-any.whl $(SONIC_PLATFORM_API_PY2)_SRC_PATH = $(PLATFORM_PATH)/mlnx-platform-api $(SONIC_PLATFORM_API_PY2)_PYTHON_VERSION = 2 +$(SONIC_PLATFORM_API_PY2)_DEPENDS = $(SONIC_PLATFORM_COMMON_PY2) $(SONIC_DAEMON_BASE_PY2) $(SONIC_CONFIG_ENGINE) SONIC_PYTHON_WHEELS += $(SONIC_PLATFORM_API_PY2) export mlnx_platform_api_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2))" diff --git a/platform/mellanox/mlnx-platform-api/.gitignore b/platform/mellanox/mlnx-platform-api/.gitignore new file mode 100644 index 0000000000..07f8a98e1f --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/.gitignore @@ -0,0 +1,2 @@ +*.pyc +.cache/ diff --git a/platform/mellanox/mlnx-platform-api/pytest.ini b/platform/mellanox/mlnx-platform-api/pytest.ini new file mode 100644 index 0000000000..c24fe5bb9e --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning diff --git a/platform/mellanox/mlnx-platform-api/setup.cfg b/platform/mellanox/mlnx-platform-api/setup.cfg new file mode 100644 index 0000000000..b7e478982c --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/setup.cfg @@ -0,0 +1,2 @@ +[aliases] +test=pytest diff --git 
a/platform/mellanox/mlnx-platform-api/setup.py b/platform/mellanox/mlnx-platform-api/setup.py index 12809c4085..f10f84924d 100644 --- a/platform/mellanox/mlnx-platform-api/setup.py +++ b/platform/mellanox/mlnx-platform-api/setup.py @@ -12,6 +12,14 @@ setup( maintainer_email='kevinw@mellanox.com', packages=[ 'sonic_platform', + 'tests' + ], + setup_requires= [ + 'pytest-runner' + ], + tests_require = [ + 'pytest', + 'mock>=2.0.0' ], classifiers=[ 'Development Status :: 3 - Alpha', @@ -26,5 +34,6 @@ setup( 'Topic :: Utilities', ], keywords='sonic SONiC platform PLATFORM', + test_suite='setup.get_test_suite' ) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py b/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py index d94d4c9ec8..d82f374931 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py @@ -1,2 +1,2 @@ __all__ = ["platform", "chassis"] -from sonic_platform import * \ No newline at end of file +from sonic_platform import * diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index c693b93462..5ecf3c150d 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -15,6 +15,7 @@ try: from sonic_daemon_base.daemon_base import Logger from os import listdir from os.path import isfile, join + from glob import glob import sys import io import re @@ -28,12 +29,17 @@ MAX_SELECT_DELAY = 3600 MLNX_NUM_PSU = 2 GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku" +GET_PLATFORM_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.platform" EEPROM_CACHE_ROOT = '/var/cache/sonic/decode-syseeprom' EEPROM_CACHE_FILE = 'syseeprom_cache' HWMGMT_SYSTEM_ROOT = '/var/run/hw-management/system/' +MST_DEVICE_NAME_PATTERN = '/dev/mst/mt[0-9]*_pciconf0' +MST_DEVICE_RE_PATTERN = 
'/dev/mst/mt([0-9]*)_pciconf0' +SPECTRUM1_CHIP_ID = '52100' + #reboot cause related definitions REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT @@ -55,6 +61,7 @@ class Chassis(ChassisBase): # Initialize SKU name self.sku_name = self._get_sku_name() + self.platform_name = self._get_platform_name() mi = get_machine_info() if mi is not None: self.name = mi['onie_platform'] @@ -93,11 +100,21 @@ class Chassis(ChassisBase): num_of_fan, num_of_drawer = self._extract_num_of_fans_and_fan_drawers() multi_rotor_in_drawer = num_of_fan > num_of_drawer + # Fan's direction isn't supported on spectrum 1 devices for now + mst_dev_list = glob(MST_DEVICE_NAME_PATTERN) + if not mst_dev_list: + raise RuntimeError("Can't get chip type due to {} not found".format(MST_DEVICE_NAME_PATTERN)) + m = re.search(MST_DEVICE_RE_PATTERN, mst_dev_list[0]) + if m.group(1) == SPECTRUM1_CHIP_ID: + has_fan_dir = False + else: + has_fan_dir = True + for index in range(num_of_fan): if multi_rotor_in_drawer: - fan = Fan(index, index/2) + fan = Fan(has_fan_dir, index, index/2, False, self.platform_name) else: - fan = Fan(index, index) + fan = Fan(has_fan_dir, index, index, False, self.platform_name) self._fan_list.append(fan) @@ -230,6 +247,12 @@ class Chassis(ChassisBase): return out.rstrip('\n') + def _get_platform_name(self): + p = subprocess.Popen(GET_PLATFORM_CMD, shell=True, stdout=subprocess.PIPE) + out, err = p.communicate() + return out.rstrip('\n') + + def _get_port_position_tuple_by_sku_name(self): position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku_name]] return position_tuple @@ -442,3 +465,8 @@ class Chassis(ChassisBase): return True, {'sfp':port_dict} else: return True, {'sfp':{}} + + def get_thermal_manager(self): + from .thermal_manager import ThermalManager + return ThermalManager + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py new file mode 100644 index 0000000000..bbf7f36c92 --- 
/dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -0,0 +1,90 @@ +DEVICE_DATA = { + 'x86_64-mlnx_msn2700-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:30":13, "31:40":14 , "41:120":15}, + "unk_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn2740-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":13}, + "unk_untrust": {"-127:15":13, "16:25":14 , "26:30":15, "31:120":17}, + } + } + }, + 'x86_64-mlnx_msn2100-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn2410-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:30":13, "31:40":14 , "41:120":15}, + "unk_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn2010-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":12}, + "unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn3700-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'x86_64-mlnx_msn3700c-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:10":12, "11:20":13 , "21:30":14, "31:35":15, "36:120":16}, + } + } + }, + 'x86_64-mlnx_msn3800-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, + 'x86_64-mlnx_msn4700-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":16}, + "unk_untrust": {"-127:120":16}, + } + } + }, + 'x86_64-mlnx_msn3420-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": 
{"-127:120":16}, + "unk_untrust": {"-127:120":16}, + } + } + }, + 'x86_64-mlnx_msn4600c-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":16}, + "unk_untrust": {"-127:120":16}, + } + } + } +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 8b057e4123..d0114dedae 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -9,6 +9,7 @@ ############################################################################# import os.path +import subprocess try: from sonic_platform_base.fan_base import FanBase @@ -22,32 +23,99 @@ PWM_MAX = 255 FAN_PATH = "/var/run/hw-management/thermal/" LED_PATH = "/var/run/hw-management/led/" +CONFIG_PATH = "/var/run/hw-management/config" +# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches +FAN_DIR = "/var/run/hw-management/system/fan_dir" +COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" + +# Platforms with unplugable FANs: +# 1. 
don't have fanX_status and should be treated as always present +platform_with_unplugable_fan = ['x86_64-mlnx_msn2010-r0', 'x86_64-mlnx_msn2100-r0'] + class Fan(FanBase): """Platform-specific Fan class""" - def __init__(self, fan_index, drawer_index = 1, psu_fan = False): + + STATUS_LED_COLOR_ORANGE = "orange" + min_cooling_level = 2 + MIN_VALID_COOLING_LEVEL = 1 + MAX_VALID_COOLING_LEVEL = 10 + # PSU fan speed vector + PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c', + '0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64'] + + def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, platform = None): # API index is starting from 0, Mellanox platform index is starting from 1 self.index = fan_index + 1 self.drawer_index = drawer_index + 1 self.is_psu_fan = psu_fan - + self.always_presence = False if platform not in platform_with_unplugable_fan else True + self.fan_min_speed_path = "fan{}_min".format(self.index) if not self.is_psu_fan: self.fan_speed_get_path = "fan{}_speed_get".format(self.index) self.fan_speed_set_path = "fan{}_speed_set".format(self.index) self.fan_presence_path = "fan{}_status".format(self.drawer_index) self.fan_max_speed_path = "fan{}_max".format(self.index) + self._name = "fan{}".format(fan_index + 1) else: self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index) self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) - self.fan_max_speed_path = "psu{}_max".format(self.index) + self._name = 'psu_{}_fan_{}'.format(self.index, 1) + self.fan_max_speed_path = None + self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command') + self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) 
self.fan_orange_led_path = "led_fan{}_orange".format(self.drawer_index) self.fan_pwm_path = "pwm1" self.fan_led_cap_path = "led_fan{}_capability".format(self.drawer_index) + if has_fan_dir: + self.fan_dir = FAN_DIR + else: + self.fan_dir = None + + + def get_direction(self): + """ + Retrieves the fan's direction + + Returns: + A string, either FAN_DIRECTION_INTAKE or FAN_DIRECTION_EXHAUST + depending on fan direction + + Notes: + What Mellanox calls forward: + Air flows from fans side to QSFP side, for example: MSN2700-CS2F + which means intake in community + What Mellanox calls reverse: + Air flow from QSFP side to fans side, for example: MSN2700-CS2R + which means exhaust in community + According to hw-mgmt: + 1 stands for forward, in other words intake + 0 stands for reverse, in other words exhaust + """ + if not self.fan_dir or self.is_psu_fan or not self.get_presence(): + return self.FAN_DIRECTION_NOT_APPLICABLE + + try: + with open(os.path.join(self.fan_dir), 'r') as fan_dir: + fan_dir_bits = int(fan_dir.read().strip()) + fan_mask = 1 << self.drawer_index - 1 + if fan_dir_bits & fan_mask: + return self.FAN_DIRECTION_INTAKE + else: + return self.FAN_DIRECTION_EXHAUST + except (ValueError, IOError) as e: + raise RuntimeError("Failed to read fan direction status to {}".format(repr(e))) + + + def get_name(self): + return self._name def get_status(self): """ @@ -58,15 +126,15 @@ class Fan(FanBase): """ status = 0 if self.is_psu_fan: - status = 1 + status = 0 else: try: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: - status = int(fault_status.read()) + status = int(fault_status.read().strip()) except (ValueError, IOError): - status = 0 + status = 1 - return status == 1 + return status == 0 def get_presence(self): """ @@ -82,11 +150,14 @@ class Fan(FanBase): else: status = 0 else: - try: - with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: - status = int(presence_status.read()) - except (ValueError, 
IOError): - status = 0 + if self.always_presence: + status = 1 + else: + try: + with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: + status = int(presence_status.read().strip()) + except (ValueError, IOError): + status = 0 return status == 1 @@ -104,7 +175,7 @@ class Fan(FanBase): speed = 0 try: with open(os.path.join(FAN_PATH, self.fan_max_speed_path), 'r') as max_fan_speed: - speed = int(max_fan_speed.read()) + speed = int(max_fan_speed.read().strip()) except (ValueError, IOError): speed = 0 @@ -120,12 +191,18 @@ class Fan(FanBase): speed = 0 try: with open(os.path.join(FAN_PATH, self.fan_speed_get_path), 'r') as fan_curr_speed: - speed_in_rpm = int(fan_curr_speed.read()) + speed_in_rpm = int(fan_curr_speed.read().strip()) except (ValueError, IOError): speed_in_rpm = 0 - + + if self.fan_max_speed_path is None: + # in case of max speed unsupported, we just return speed in unit of RPM. + return speed_in_rpm + max_speed_in_rpm = self._get_max_speed_in_rpm() speed = 100*speed_in_rpm/max_speed_in_rpm + if speed > 100: + speed = 100 return speed @@ -136,14 +213,13 @@ class Fan(FanBase): Returns: int: percentage of the max fan speed """ - speed = 0 - if self.is_psu_fan: # Not like system fan, psu fan speed can not be modified, so target speed is N/A - return speed + return self.get_speed() + try: with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm: - pwm = int(fan_pwm.read()) + pwm = int(fan_pwm.read().strip()) except (ValueError, IOError): pwm = 0 @@ -163,13 +239,36 @@ class Fan(FanBase): bool: True if set success, False if fail. """ status = True - pwm = int(round(PWM_MAX*speed/100.0)) if self.is_psu_fan: - #PSU fan speed is not setable. 
- return False - + if not self.get_presence(): + return False + from .thermal import logger + try: + with open(self.psu_i2c_bus_path, 'r') as f: + bus = f.read().strip() + with open(self.psu_i2c_addr_path, 'r') as f: + addr = f.read().strip() + with open(self.psu_i2c_command_path, 'r') as f: + command = f.read().strip() + speed = Fan.PSU_FAN_SPEED[int(speed / 10)] + command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed) + subprocess.check_call(command, shell = True) + return True + except subprocess.CalledProcessError as ce: + logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output)) + return False + except Exception as e: + logger.log_error('Failed to set PSU FAN speed - {}'.format(e)) + return False + try: + cooling_level = int(speed / 10) + if cooling_level < self.min_cooling_level: + cooling_level = self.min_cooling_level + speed = self.min_cooling_level * 10 + self.set_cooling_level(cooling_level, cooling_level) + pwm = int(round(PWM_MAX*speed/100.0)) with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: fan_pwm.write(str(pwm)) except (ValueError, IOError): @@ -243,4 +342,43 @@ class Fan(FanBase): considered tolerable """ # The tolerance value is fixed as 20% for all the Mellanox platform - return 20 \ No newline at end of file + return 20 + + @classmethod + def set_cooling_level(cls, level, cur_state): + """ + Change cooling level. The input level should be an integer value [1, 10]. + 1 means 10%, 2 means 20%, 10 means 100%. + """ + if not isinstance(level, int): + raise RuntimeError("Failed to set cooling level, input parameter must be integer") + + if level < cls.MIN_VALID_COOLING_LEVEL or level > cls.MAX_VALID_COOLING_LEVEL: + raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format( + cls.MIN_VALID_COOLING_LEVEL, + cls.MAX_VALID_COOLING_LEVEL, + level + )) + + try: + # Reset FAN cooling level vector. 
According to low level team, + # if we need set cooling level to X, we need first write a (10+X) + # to cooling_cur_state file to reset the cooling level vector. + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(level + 10)) + + # We need set cooling level after resetting the cooling level vector + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(cur_state)) + except (ValueError, IOError) as e: + raise RuntimeError("Failed to set cooling level - {}".format(e)) + + @classmethod + def get_cooling_level(cls): + try: + with open(COOLING_STATE_PATH, 'r') as cooling_state: + cooling_level = int(cooling_state.read().strip()) + return cooling_level + except (ValueError, IOError) as e: + raise RuntimeError("Failed to get cooling level - {}".format(e)) + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py b/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py index 25461986f3..6d81ca3e7b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py @@ -24,6 +24,7 @@ class Platform(PlatformBase): self._chassis.initialize_psu() self._chassis.initialize_fan() self._chassis.initialize_eeprom() + self._chassis.initialize_thermals() def _is_host(self): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index f403678a66..eb81fd65a0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -60,6 +60,7 @@ class Psu(PsuBase): psu_oper_status = "thermal/psu{}_pwr_status".format(self.index) #psu_oper_status should always be present for all SKUs self.psu_oper_status = os.path.join(self.psu_path, psu_oper_status) + self._name = "PSU{}".format(psu_index + 1) if sku in hwsku_dict_psu: filemap = psu_profile_list[hwsku_dict_psu[sku]] @@ -90,9 +91,20 @@ 
class Psu(PsuBase): psu_presence = os.path.join(self.psu_path, psu_presence) self.psu_presence = psu_presence - fan = Fan(psu_index, psu_index, True) - if fan.get_presence(): - self._fan = fan + # unplugable PSU has no FAN + if sku not in hwsku_dict_with_unplugable_psu: + fan = Fan(False, psu_index, psu_index, True) + self._fan_list.append(fan) + + self.psu_green_led_path = "led_psu_green" + self.psu_red_led_path = "led_psu_red" + self.psu_orange_led_path = "led_psu_orange" + self.psu_led_cap_path = "led_psu_capability" + + + def get_name(self): + return self._name + def _read_generic_file(self, filename, len): """ @@ -100,8 +112,10 @@ class Psu(PsuBase): """ result = 0 try: + if not os.path.exists(filename): + return result with open(filename, 'r') as fileobj: - result = int(fileobj.read()) + result = int(fileobj.read().strip()) except Exception as e: logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result @@ -169,3 +183,117 @@ class Psu(PsuBase): return float(power) / 1000000 else: return None + + + def _get_led_capability(self): + cap_list = None + try: + with open(os.path.join(LED_PATH, self.psu_led_cap_path), 'r') as psu_led_cap: + caps = psu_led_cap.read() + cap_list = caps.split() + except (ValueError, IOError): + pass + + return cap_list + + + def set_status_led(self, color): + """ + Sets the state of the PSU status LED + + Args: + color: A string representing the color with which to set the + PSU status LED + + Returns: + bool: True if status LED state is set successfully, False if not + + Notes: + Only one led for all PSUs. 
+ """ + led_cap_list = self._get_led_capability() + if led_cap_list is None: + return False + + status = False + try: + if color == self.STATUS_LED_COLOR_GREEN: + with open(os.path.join(LED_PATH, self.psu_green_led_path), 'w') as psu_led: + psu_led.write(LED_ON) + status = True + elif color == self.STATUS_LED_COLOR_RED: + # Some fan don't support red led but support orange led, in this case we set led to orange + if self.STATUS_LED_COLOR_RED in led_cap_list: + led_path = os.path.join(LED_PATH, self.psu_red_led_path) + elif self.STATUS_LED_COLOR_ORANGE in led_cap_list: + led_path = os.path.join(LED_PATH, self.psu_orange_led_path) + else: + return False + with open(led_path, 'w') as psu_led: + psu_led.write(LED_ON) + status = True + elif color == self.STATUS_LED_COLOR_OFF: + if self.STATUS_LED_COLOR_GREEN in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_green_led_path), 'w') as psu_led: + psu_led.write(str(LED_OFF)) + if self.STATUS_LED_COLOR_RED in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_red_led_path), 'w') as psu_led: + psu_led.write(str(LED_OFF)) + if self.STATUS_LED_COLOR_ORANGE in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_orange_led_path), 'w') as psu_led: + psu_led.write(str(LED_OFF)) + + status = True + else: + status = False + except (ValueError, IOError): + status = False + + return status + + + def get_status_led(self): + """ + Gets the state of the PSU status LED + + Returns: + A string, one of the predefined STATUS_LED_COLOR_* strings above + """ + led_cap_list = self._get_led_capability() + if led_cap_list is None: + return self.STATUS_LED_COLOR_OFF + + try: + with open(os.path.join(LED_PATH, self.psu_green_led_path), 'r') as psu_led: + if LED_OFF != psu_led.read().rstrip('\n'): + return self.STATUS_LED_COLOR_GREEN + if self.STATUS_LED_COLOR_RED in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_red_led_path), 'r') as psu_led: + if LED_OFF != psu_led.read().rstrip('\n'): + return 
self.STATUS_LED_COLOR_RED + if self.STATUS_LED_COLOR_ORANGE in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_orange_led_path), 'r') as psu_led: + if LED_OFF != psu_led.read().rstrip('\n'): + return self.STATUS_LED_COLOR_RED + except (ValueError, IOError) as e: + raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e))) + + return self.STATUS_LED_COLOR_OFF + + + def get_power_available_status(self): + """ + Gets the power available status + + Returns: + True if power is present and power on. + False and "absence of PSU" if power is not present. + False and "absence of power" if power is present but not power on. + """ + if not self.get_presence(): + return False, "absence of PSU" + elif not self.get_powergood_status(): + return False, "absence of power" + else: + return True, "" + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 1d03016af4..7f462b9c30 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -36,28 +36,46 @@ THERMAL_DEV_BOARD_AMBIENT = "board_amb" THERMAL_API_GET_TEMPERATURE = "get_temperature" THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold" +THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD = "get_high_critical_threshold" + +THERMAL_API_INVALID_HIGH_THRESHOLD = 0.0 HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" +THERMAL_ZONE_ASIC_PATH = "/var/run/hw-management/thermal/mlxsw/" +THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/" +THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/" +THERMAL_ZONE_MODE = "thermal_zone_mode" +THERMAL_ZONE_POLICY = "thermal_zone_policy" +THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" +THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" + +MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault" + 
thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", - THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max" + THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"cpu_core{}_crit" } thermal_api_handler_cpu_pack = { THERMAL_API_GET_TEMPERATURE:"cpu_pack", - THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max" + THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"cpu_pack_crit" } thermal_api_handler_module = { THERMAL_API_GET_TEMPERATURE:"module{}_temp_input", - THERMAL_API_GET_HIGH_THRESHOLD:"module{}_temp_crit" + THERMAL_API_GET_HIGH_THRESHOLD:"module{}_temp_crit", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"module{}_temp_emergency" } thermal_api_handler_psu = { THERMAL_API_GET_TEMPERATURE:"psu{}_temp", - THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max" + THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_api_handler_gearbox = { - THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}", - THERMAL_API_GET_HIGH_THRESHOLD:None + THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input", + THERMAL_API_GET_HIGH_THRESHOLD:None, + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_ambient_apis = { THERMAL_DEV_ASIC_AMBIENT : "asic", @@ -281,10 +299,12 @@ thermal_profile_list = [ } ] + def initialize_thermals(sku, thermal_list, psu_list): # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index] + Thermal.thermal_profile = thermal_profile for category in thermal_device_categories_all: if category == THERMAL_DEV_CATEGORY_AMBIENT: count, ambient_list = thermal_profile[category] @@ -303,15 +323,20 @@ def initialize_thermals(sku, thermal_list, psu_list): else: if category == THERMAL_DEV_CATEGORY_PSU: for index in range(count): - thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off") + thermal = Thermal(category, start + 
index, True, psu_list[index].get_power_available_status) thermal_list.append(thermal) else: for index in range(count): thermal = Thermal(category, start + index, True) thermal_list.append(thermal) + + class Thermal(ThermalBase): - def __init__(self, category, index, has_index, dependency = None, hint = None): + thermal_profile = None + thermal_algorithm_status = False + + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories """ @@ -328,8 +353,9 @@ class Thermal(ThermalBase): self.category = category self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE) self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) + self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD) self.dependency = dependency - self.dependent_hint = hint + def get_name(self): """ @@ -340,18 +366,21 @@ class Thermal(ThermalBase): """ return self.name - def _read_generic_file(self, filename, len): + + @classmethod + def _read_generic_file(cls, filename, len): """ Read a generic file, returns the contents of the file """ result = None try: with open(filename, 'r') as fileobj: - result = fileobj.read() + result = fileobj.read().strip() except Exception as e: logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result + def _get_file_from_api(self, api_name): if self.category == THERMAL_DEV_CATEGORY_AMBIENT: if api_name == THERMAL_API_GET_TEMPERATURE: @@ -363,9 +392,13 @@ class Thermal(ThermalBase): if self.category in thermal_device_categories_singleton: filename = handler else: - filename = handler.format(self.index) + if handler: + filename = handler.format(self.index) + else: + return None return join(HW_MGMT_THERMAL_ROOT, filename) + def get_temperature(self): """ Retrieves current temperature reading from thermal @@ -374,19 +407,20 @@ class Thermal(ThermalBase): A float number of current 
temperature in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - if self.dependency and not self.dependency(): - if self.dependent_hint: - hint = self.dependent_hint - else: - hint = "unknown reason" - logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint)) - return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.temperature, 0) if value_str is None: return None value_float = float(value_str) + if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: + return None return value_float / 1000.0 + def get_high_threshold(self): """ Retrieves the high threshold temperature of thermal @@ -397,8 +431,167 @@ class Thermal(ThermalBase): """ if self.high_threshold is None: return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_threshold, 0) if value_str is None: return None value_float = float(value_str) + if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: + return None return value_float / 1000.0 + + + def get_high_critical_threshold(self): + """ + Retrieves the high critical threshold temperature of thermal + + Returns: + A float number, the high critical threshold temperature of thermal in Celsius + up to nearest thousandth of one degree Celsius, e.g. 
30.125 + """ + if self.high_critical_threshold is None: + return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) + return None + value_str = self._read_generic_file(self.high_critical_threshold, 0) + if value_str is None: + return None + value_float = float(value_str) + if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: + return None + return value_float / 1000.0 + + + @classmethod + def _write_generic_file(cls, filename, content): + """ + Generic functions to write content to a specified file path if + the content has changed. + """ + try: + with open(filename, 'w+') as file_obj: + origin_content = file_obj.read() + if origin_content != content: + file_obj.write(content) + except Exception as e: + logger.log_info("Fail to write file {} due to {}".format(filename, repr(e))) + + @classmethod + def set_thermal_algorithm_status(cls, status, force=True): + """ + Enable/disable kernel thermal algorithm. + When enable kernel thermal algorithm, kernel will adjust fan speed + according to thermal zones temperature. Please note that kernel will + only adjust fan speed when temperature across some "edge", e.g temperature + changes to exceed high threshold. + When disable kernel thermal algorithm, kernel no longer adjust fan speed. + We usually disable the algorithm when we want to set a fix speed. E.g, when + a fan unit is removed from system, we will set fan speed to 100% and disable + the algorithm to avoid it adjust the speed. + + Returns: + True if thermal algorithm status changed. 
+ """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not force and cls.thermal_algorithm_status == status: + return False + + cls.thermal_algorithm_status = status + content = "enabled" if status else "disabled" + policy = "step_wise" if status else "user_space" + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + return True + + @classmethod + def check_thermal_zone_temperature(cls): + """ + Check thermal zone current temperature with normal temperature + + Returns: + True if all thermal zones current temperature less or equal than normal temperature + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_ASIC_PATH): + return False + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_MODULE_PATH.format(start + index)): + return False + + if 
THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_GEARBOX_PATH.format(start + index)): + return False + + return True + + @classmethod + def _check_thermal_zone_temperature(cls, thermal_zone_path): + normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) + current_temp_path = join(thermal_zone_path, THERMAL_ZONE_TEMPERATURE) + normal = None + current = None + try: + with open(normal_temp_path, 'r') as file_obj: + normal = float(file_obj.read()) + + with open(current_temp_path, 'r') as file_obj: + current = float(file_obj.read()) + + return current <= normal + except Exception as e: + logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + + @classmethod + def check_module_temperature_trustable(cls): + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + for index in range(count): + fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start) + fault = cls._read_generic_file(fault_file_path, 0) + if fault.strip() != '0': + return 'untrust' + return 'trust' + + @classmethod + def get_min_amb_temperature(cls): + fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT) + port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) + + # if there is any exception, let it raise + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) + return fan_ambient_temp if fan_ambient_temp < port_ambient_temp else port_ambient_temp diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py new file mode 100644 
index 0000000000..e7436bd0a5 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -0,0 +1,209 @@ +from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase +from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object +from .thermal import logger + + +class SetFanSpeedAction(ThermalPolicyActionBase): + """ + Base thermal action class to set speed for fans + """ + # JSON field definition + JSON_FIELD_SPEED = 'speed' + + def __init__(self): + """ + Constructor of SetFanSpeedAction which actually do nothing. + """ + self.speed = None + + def load_from_json(self, json_obj): + """ + Construct SetFanSpeedAction via JSON. JSON example: + { + "type": "fan.all.set_speed" + "speed": "100" + } + :param json_obj: A JSON object representing a SetFanSpeedAction action. + :return: + """ + if SetFanSpeedAction.JSON_FIELD_SPEED in json_obj: + speed = float(json_obj[SetFanSpeedAction.JSON_FIELD_SPEED]) + if speed < 0 or speed > 100: + raise ValueError('SetFanSpeedAction invalid speed value {} in JSON policy file, valid value should be [0, 100]'. + format(speed)) + self.speed = float(json_obj[SetFanSpeedAction.JSON_FIELD_SPEED]) + else: + raise ValueError('SetFanSpeedAction missing mandatory field {} in JSON policy file'. + format(SetFanSpeedAction.JSON_FIELD_SPEED)) + + +@thermal_json_object('fan.all.set_speed') +class SetAllFanSpeedAction(SetFanSpeedAction): + """ + Action to set speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. 
+ :return: + """ + from .thermal_infos import FanInfo + if FanInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[FanInfo.INFO_NAME], FanInfo): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + for fan in fan_info_obj.get_presence_fans(): + fan.set_speed(self.speed) + logger.log_info('Set all system FAN speed to {}'.format(self.speed)) + + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed) + + @classmethod + def set_psu_fan_speed(cls, thermal_info_dict, speed): + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(speed) + + +@thermal_json_object('fan.all.check_and_set_speed') +class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): + """ + Action to check thermal zone temperature and recover speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Check thermal zone and set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal import Thermal + if Thermal.check_thermal_zone_temperature(): + SetAllFanSpeedAction.execute(self, thermal_info_dict) + + +@thermal_json_object('thermal_control.control') +class ControlThermalAlgoAction(ThermalPolicyActionBase): + """ + Action to control the thermal control algorithm + """ + # JSON field definition + JSON_FIELD_STATUS = 'status' + + def __init__(self): + self.status = True + + def load_from_json(self, json_obj): + """ + Construct ControlThermalAlgoAction via JSON. JSON example: + { + "type": "thermal_control.control" + "status": "true" + } + :param json_obj: A JSON object representing a ControlThermalAlgoAction action. 
+ :return: + """ + if ControlThermalAlgoAction.JSON_FIELD_STATUS in json_obj: + status_str = json_obj[ControlThermalAlgoAction.JSON_FIELD_STATUS].lower() + if status_str == 'true': + self.status = True + elif status_str == 'false': + self.status = False + else: + raise ValueError('Invalid {} field value, please specify true of false'. + format(ControlThermalAlgoAction.JSON_FIELD_STATUS)) + else: + raise ValueError('ControlThermalAlgoAction ' + 'missing mandatory field {} in JSON policy file'. + format(ControlThermalAlgoAction.JSON_FIELD_STATUS)) + + def execute(self, thermal_info_dict): + """ + Disable thermal control algorithm + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal_infos import FanInfo + from .thermal import Thermal + from .thermal_conditions import UpdateCoolingLevelToMinCondition + from .fan import Fan + status_changed = Thermal.set_thermal_algorithm_status(self.status, False) + + # Only update cooling level if thermal algorithm status changed + if status_changed: + if self.status: + # Check thermal zone temperature, if all thermal zone temperature + # back to normal, set it to minimum allowed speed to + # save power + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + + +@thermal_json_object('thermal.recover') +class ThermalRecoverAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + +class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): + UNKNOWN_SKU_COOLING_LEVEL = 6 + def execute(self, thermal_info_dict): + from .device_data import DEVICE_DATA + from .fan import Fan + from .thermal_infos import ChassisInfo + from .thermal_conditions import MinCoolingLevelChangeCondition + from .thermal_conditions import UpdateCoolingLevelToMinCondition + + chassis = 
thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + if chassis.platform_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.platform_name] or 'minimum_table' not in DEVICE_DATA[chassis.platform_name]['thermal']: + Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL + else: + trust_state = MinCoolingLevelChangeCondition.trust_state + temperature = MinCoolingLevelChangeCondition.temperature + minimum_table = DEVICE_DATA[chassis.platform_name]['thermal']['minimum_table']['unk_{}'.format(trust_state)] + + for key, cooling_level in minimum_table.items(): + temp_range = key.split(':') + temp_min = int(temp_range[0].strip()) + temp_max = int(temp_range[1].strip()) + if temp_min <= temperature <= temp_max: + Fan.min_cooling_level = cooling_level - 10 + break + + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level < Fan.min_cooling_level: + Fan.set_cooling_level(Fan.min_cooling_level, Fan.min_cooling_level) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) + else: + Fan.set_cooling_level(Fan.min_cooling_level, current_cooling_level) + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + +class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + from .thermal_conditions import CoolingLevelChangeCondition + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10) + + +class UpdateCoolingLevelToMinAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + self.update_cooling_level_to_minimum(thermal_info_dict) + + @classmethod + def update_cooling_level_to_minimum(cls, thermal_info_dict): + from .fan import Fan + from .thermal import Thermal + from .thermal_conditions import UpdateCoolingLevelToMinCondition + from .thermal_infos import FanInfo + if Thermal.check_thermal_zone_temperature(): + fan_info_obj = 
thermal_info_dict[FanInfo.INFO_NAME] + speed = Fan.min_cooling_level * 10 + for fan in fan_info_obj.get_presence_fans(): + fan.set_speed(speed) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, speed) + UpdateCoolingLevelToMinCondition.enable = False + else: + UpdateCoolingLevelToMinCondition.enable = True + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py new file mode 100644 index 0000000000..94e18a2e00 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -0,0 +1,126 @@ +from sonic_platform_base.sonic_thermal_control.thermal_condition_base import ThermalPolicyConditionBase +from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object + + +class FanCondition(ThermalPolicyConditionBase): + def get_fan_info(self, thermal_info_dict): + from .thermal_infos import FanInfo + if FanInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[FanInfo.INFO_NAME], FanInfo): + return thermal_info_dict[FanInfo.INFO_NAME] + else: + return None + + +@thermal_json_object('fan.any.absence') +class AnyFanAbsenceCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_absence_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.absence') +class AllFanAbsenceCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_presence_fans()) == 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.presence') +class AllFanPresenceCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_absence_fans()) == 0 if fan_info_obj else False + + +@thermal_json_object('fan.any.fault') +class 
AnyFanFaultCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.good') +class AllFanGoodCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) == 0 if fan_info_obj else False + + +class PsuCondition(ThermalPolicyConditionBase): + def get_psu_info(self, thermal_info_dict): + from .thermal_infos import PsuInfo + if PsuInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[PsuInfo.INFO_NAME], PsuInfo): + return thermal_info_dict[PsuInfo.INFO_NAME] + else: + return None + + +@thermal_json_object('psu.any.absence') +class AnyPsuAbsenceCondition(PsuCondition): + def is_match(self, thermal_info_dict): + psu_info_obj = self.get_psu_info(thermal_info_dict) + return len(psu_info_obj.get_absence_psus()) > 0 if psu_info_obj else False + + +@thermal_json_object('psu.all.absence') +class AllPsuAbsenceCondition(PsuCondition): + def is_match(self, thermal_info_dict): + psu_info_obj = self.get_psu_info(thermal_info_dict) + return len(psu_info_obj.get_presence_psus()) == 0 if psu_info_obj else False + + +@thermal_json_object('psu.all.presence') +class AllPsuPresenceCondition(PsuCondition): + def is_match(self, thermal_info_dict): + psu_info_obj = self.get_psu_info(thermal_info_dict) + return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False + + +class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): + trust_state = None + temperature = None + + def is_match(self, thermal_info_dict): + from .thermal import Thermal + + trust_state = Thermal.check_module_temperature_trustable() + temperature = Thermal.get_min_amb_temperature() + temperature = temperature / 1000 + + change_cooling_level = False + if trust_state != MinCoolingLevelChangeCondition.trust_state: + 
MinCoolingLevelChangeCondition.trust_state = trust_state + change_cooling_level = True + + if temperature != MinCoolingLevelChangeCondition.temperature: + MinCoolingLevelChangeCondition.temperature = temperature + change_cooling_level = True + + return change_cooling_level + + +class CoolingLevelChangeCondition(ThermalPolicyConditionBase): + cooling_level = None + + def is_match(self, thermal_info_dict): + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level != CoolingLevelChangeCondition.cooling_level: + CoolingLevelChangeCondition.cooling_level = current_cooling_level + return True + else: + return False + + +class UpdateCoolingLevelToMinCondition(ThermalPolicyConditionBase): + enable = False + def is_match(self, thermal_info_dict): + if not UpdateCoolingLevelToMinCondition.enable: + return False + + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level == Fan.min_cooling_level: + UpdateCoolingLevelToMinCondition.enable = False + return False + return True diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py new file mode 100644 index 0000000000..e810a56464 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -0,0 +1,154 @@ +from sonic_platform_base.sonic_thermal_control.thermal_info_base import ThermalPolicyInfoBase +from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object + + +@thermal_json_object('fan_info') +class FanInfo(ThermalPolicyInfoBase): + """ + Fan information needed by thermal policy + """ + + # Fan information name + INFO_NAME = 'fan_info' + + def __init__(self): + self._absence_fans = set() + self._presence_fans = set() + self._fault_fans = set() + self._status_changed = False + + def collect(self, chassis): + """ + Collect absence and presence fans. 
+ :param chassis: The chassis object + :return: + """ + self._status_changed = False + for fan in chassis.get_all_fans(): + presence = fan.get_presence() + status = fan.get_status() + if presence and fan not in self._presence_fans: + self._presence_fans.add(fan) + self._status_changed = True + if fan in self._absence_fans: + self._absence_fans.remove(fan) + elif not presence and fan not in self._absence_fans: + self._absence_fans.add(fan) + self._status_changed = True + if fan in self._presence_fans: + self._presence_fans.remove(fan) + + if not status and fan not in self._fault_fans: + self._fault_fans.add(fan) + self._status_changed = True + elif status and fan in self._fault_fans: + self._fault_fans.remove(fan) + self._status_changed = True + + + def get_absence_fans(self): + """ + Retrieves absence fans + :return: A set of absence fans + """ + return self._absence_fans + + def get_presence_fans(self): + """ + Retrieves presence fans + :return: A set of presence fans + """ + return self._presence_fans + + def get_fault_fans(self): + """ + Retrieves fault fans + :return: A set of fault fans + """ + return self._fault_fans + + def is_status_changed(self): + """ + Retrieves if the status of fan information changed + :return: True if status changed else False + """ + return self._status_changed + + +@thermal_json_object('psu_info') +class PsuInfo(ThermalPolicyInfoBase): + """ + PSU information needed by thermal policy + """ + INFO_NAME = 'psu_info' + + def __init__(self): + self._absence_psus = set() + self._presence_psus = set() + self._status_changed = False + + def collect(self, chassis): + """ + Collect absence and presence PSUs. 
+ :param chassis: The chassis object + :return: + """ + self._status_changed = False + for psu in chassis.get_all_psus(): + if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus: + self._presence_psus.add(psu) + self._status_changed = True + if psu in self._absence_psus: + self._absence_psus.remove(psu) + elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus: + self._absence_psus.add(psu) + self._status_changed = True + if psu in self._presence_psus: + self._presence_psus.remove(psu) + + def get_absence_psus(self): + """ + Retrieves presence PSUs + :return: A set of absence PSUs + """ + return self._absence_psus + + def get_presence_psus(self): + """ + Retrieves presence PSUs + :return: A set of presence fans + """ + return self._presence_psus + + def is_status_changed(self): + """ + Retrieves if the status of PSU information changed + :return: True if status changed else False + """ + return self._status_changed + + +@thermal_json_object('chassis_info') +class ChassisInfo(ThermalPolicyInfoBase): + """ + Chassis information needed by thermal policy + """ + INFO_NAME = 'chassis_info' + + def __init__(self): + self._chassis = None + + def collect(self, chassis): + """ + Collect platform chassis. + :param chassis: The chassis object + :return: + """ + self._chassis = chassis + + def get_chassis(self): + """ + Retrieves platform chassis object + :return: A platform chassis object. 
class ThermalManager(ThermalManagerBase):
    """
    Mellanox thermal manager.

    Registers three policies that are built in code (see _add_private_thermal_policy) and turns the
    thermal control algorithm on and off through Thermal.set_thermal_algorithm_status().
    """
    @classmethod
    def initialize(cls):
        """
        Initialize thermal manager, including register thermal condition types and thermal action types
        and any other vendor specific initialization.

        This implementation only registers the built-in private policies.
        :return:
        """
        cls._add_private_thermal_policy()

    @classmethod
    def deinitialize(cls):
        """
        Destroy thermal manager, including any vendor specific cleanup.

        This implementation is not a no-op: it starts the thermal control algorithm again.
        :return:
        """
        cls.start_thermal_control_algorithm()

    @classmethod
    def start_thermal_control_algorithm(cls):
        """
        Start thermal control algorithm

        Returns:
            None. Whatever Thermal.set_thermal_algorithm_status() returns is not propagated.
        """
        # NOTE(review): imported at call time rather than at module level - presumably to avoid a
        # circular import with .thermal; confirm before hoisting.
        from .thermal import Thermal
        Thermal.set_thermal_algorithm_status(True)

    @classmethod
    def stop_thermal_control_algorithm(cls):
        """
        Stop thermal control algorithm

        Returns:
            None. Whatever Thermal.set_thermal_algorithm_status() returns is not propagated.
        """
        # NOTE(review): same call-time import as in start_thermal_control_algorithm.
        from .thermal import Thermal
        Thermal.set_thermal_algorithm_status(False)

    @classmethod
    def _add_private_thermal_policy(cls):
        """
        Build three single-condition / single-action policies in code and store them in
        cls._policy_dict under the names 'DynamicMinCoolingLevelPolicy', 'UpdatePsuFanSpeedPolicy'
        and 'UpdateCoolingLevelPolicy'.

        Conditions and actions are keyed by their own class in the policy's dictionaries.
        :return:
        """
        dynamic_min_speed_policy = ThermalPolicy()
        dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition()
        dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction()
        cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy

        update_psu_fan_speed_policy = ThermalPolicy()
        update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition()
        update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction()
        cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy

        update_cooling_level_policy = ThermalPolicy()
        update_cooling_level_policy.conditions[UpdateCoolingLevelToMinCondition] = UpdateCoolingLevelToMinCondition()
        update_cooling_level_policy.actions[UpdateCoolingLevelToMinAction] = UpdateCoolingLevelToMinAction()
        cls._policy_dict['UpdateCoolingLevelPolicy'] = update_cooling_level_policy
class MockFan:
    """Stand-in fan for the thermal policy tests."""

    # Target speed lives on the class, so one set_speed() call is visible through every MockFan.
    speed = 60

    def __init__(self):
        self.status = True
        self.presence = True

    def get_status(self):
        return self.status

    def get_presence(self):
        return self.presence

    def get_target_speed(self):
        return MockFan.speed

    def set_speed(self, speed):
        MockFan.speed = speed


class MockPsu:
    """Stand-in PSU: present and power-good until a test flips the flags; it owns no fans."""

    def __init__(self):
        self.powergood = True
        self.presence = True

    def get_powergood_status(self):
        return self.powergood

    def get_presence(self):
        return self.presence

    def get_all_fans(self):
        return []


class MockChassis:
    """Stand-in chassis that simply hands out the fan and PSU lists a test has filled in."""

    def __init__(self):
        self.psu_list = []
        self.fan_list = []

    def get_all_fans(self):
        return self.fan_list

    def get_all_psus(self):
        return self.psu_list

    def get_thermal_manager(self):
        # Returns the ThermalManager class itself, not an instance.
        from sonic_platform.thermal_manager import ThermalManager
        return ThermalManager

    def make_psu_absence(self):
        """Append one PSU that reports itself as not present."""
        missing_psu = MockPsu()
        missing_psu.presence = False
        self.psu_list.append(missing_psu)

    def make_fan_absence(self):
        """Append one fan that reports itself as not present."""
        missing_fan = MockFan()
        missing_fan.presence = False
        self.fan_list.append(missing_fan)
def test_get_absence_fan_direction():
    """A fan whose presence check is forced to False reports 'not applicable' as its direction."""
    absent_fan = Fan(True, 0, 0)
    # Replace the presence probe on this one instance only.
    absent_fan.get_presence = MagicMock(return_value=False)

    assert absent_fan.fan_dir is not None
    assert not absent_fan.is_psu_fan
    reported_direction = absent_fan.get_direction()
    assert reported_direction == Fan.FAN_DIRECTION_NOT_APPLICABLE
def test_psu_info():
    """PsuInfo follows one PSU from absent, to present, to present-but-not-power-good."""
    chassis = MockChassis()
    chassis.make_psu_absence()
    tracker = PsuInfo()
    the_psu = chassis.get_all_psus()[0]

    def collect_and_expect(absent_count, present_count):
        # Every step below is a state change, so the changed flag must be set each time.
        tracker.collect(chassis)
        assert len(tracker.get_absence_psus()) == absent_count
        assert len(tracker.get_presence_psus()) == present_count
        assert tracker.is_status_changed()

    collect_and_expect(1, 0)

    the_psu.presence = True
    collect_and_expect(0, 1)

    the_psu.powergood = False
    collect_and_expect(1, 0)
def test_any_fan_absence_condition():
    """AnyFanAbsenceCondition matches while a fan is missing and stops matching once it is back."""
    from sonic_platform.thermal_conditions import AnyFanAbsenceCondition

    chassis = MockChassis()
    chassis.make_fan_absence()
    collected = FanInfo()
    collected.collect(chassis)
    info_dict = {'fan_info': collected}

    condition = AnyFanAbsenceCondition()
    assert condition.is_match(info_dict)

    # Plug the only fan back in and refresh the collected info.
    chassis.get_all_fans()[0].presence = True
    collected.collect(chassis)
    assert not condition.is_match(info_dict)
def test_any_psu_absence_condition():
    """AnyPsuAbsenceCondition matches while a PSU is missing and stops matching once it is back."""
    from sonic_platform.thermal_conditions import AnyPsuAbsenceCondition

    chassis = MockChassis()
    chassis.make_psu_absence()
    collected = PsuInfo()
    collected.collect(chassis)
    info_dict = {'psu_info': collected}

    condition = AnyPsuAbsenceCondition()
    assert condition.is_match(info_dict)

    # Plug the only PSU back in and refresh the collected info.
    chassis.get_all_psus()[0].presence = True
    collected.collect(chassis)
    assert not condition.is_match(info_dict)
def test_all_fan_presence_condition():
    """
    AllPsuPresenceCondition matches only when every PSU is present.

    NOTE(review): despite the "fan" in its name this test exercises the PSU condition; consider
    renaming it to test_all_psu_presence_condition.
    """
    from sonic_platform.thermal_conditions import AllPsuPresenceCondition

    chassis = MockChassis()
    chassis.make_psu_absence()
    all_psus = chassis.get_all_psus()
    all_psus.append(MockPsu())  # second PSU is present from the start

    collected = PsuInfo()
    collected.collect(chassis)
    info_dict = {'psu_info': collected}

    condition = AllPsuPresenceCondition()
    assert not condition.is_match(info_dict)

    all_psus[0].presence = True
    collected.collect(chassis)
    assert condition.is_match(info_dict)
def test_load_control_thermal_algo_action():
    """ControlThermalAlgoAction accepts "true"/"false" for status and rejects anything else."""
    from sonic_platform.thermal_actions import ControlThermalAlgoAction
    action = ControlThermalAlgoAction()

    action.load_from_json(json.loads('{\"status\": \"false\"}'))
    assert not action.status

    action.load_from_json(json.loads('{\"status\": \"true\"}'))
    assert action.status

    # An unparsable value and a missing "status" key must both be refused.
    for bad_config in ('{\"status\": \"invalid\"}', '{\"invalid\": \"true\"}'):
        rejected = json.loads(bad_config)
        with pytest.raises(ValueError):
            action.load_from_json(rejected)
def test_load_duplicate_condition():
    """Loading a policy that lists the same condition type twice must raise."""
    from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy

    fixture_path = os.path.join(test_path, 'duplicate_condition.json')
    with open(fixture_path) as fixture:
        policy_json = json.load(fixture)

    policy = ThermalPolicy()
    with pytest.raises(Exception):
        policy.load_from_json(policy_json)
def check_minimum_table_data(platform, minimum_table):
    """
    Assert that a platform's minimum cooling level table is well formed.

    Each key of minimum_table is '<direction>_<trust state>' and each value maps a 'low:high'
    range string to a cooling level. Ordered by cooling level, the ranges must start at -127,
    follow each other without gap or overlap, and carry strictly increasing levels in 10..20.

    :param platform: Platform name, used only in the failure message
    :param minimum_table: The table to validate
    :return:
    """
    known_directions = ('p2c', 'c2p', 'unk')
    known_trust_states = ('trust', 'untrust')

    for category in minimum_table:
        name_parts = category.split('_')
        direction = name_parts[0]
        assert direction in known_directions
        trust_state = name_parts[1]
        assert trust_state in known_trust_states

        # (cooling level, range string) pairs, lowest cooling level first.
        by_level = sorted(
            ((level, span) for span, level in minimum_table[category].items()),
            key=lambda entry: entry[0])

        last_high = None
        last_level = None
        for entry in by_level:
            level, span = entry
            bounds = span.split(':')
            low, high = int(bounds[0]), int(bounds[1])
            assert low < high

            if last_high is None:
                # The first range has to start at the lowest supported value.
                assert low == -127
            else:
                assert low - last_high == 1, '{}-{}-{} error, item={}'.format(platform, direction, trust_state, entry)
            last_high = high

            assert 10 <= level <= 20
            assert last_level is None or level > last_level
            last_level = level
policy.actions[ChangeMinCoolingLevelAction] + Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') + Thermal.get_min_amb_temperature = MagicMock(return_value=35000) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'trust' + assert MinCoolingLevelChangeCondition.temperature == 35 + assert not condition.is_match(None) + + Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'untrust' + + Thermal.get_min_amb_temperature = MagicMock(return_value=25000) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.temperature == 25 + + chassis = MockChassis() + chassis.platform_name = 'invalid' + info = ChassisInfo() + info._chassis = chassis + thermal_info_dict = {ChassisInfo.INFO_NAME: info} + Fan.get_cooling_level = MagicMock(return_value=5) + Fan.set_cooling_level = MagicMock() + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 6 + Fan.set_cooling_level.assert_called_with(6, 6) + Fan.set_cooling_level.call_count = 0 + + chassis.platform_name = 'x86_64-mlnx_msn2700-r0' + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 3 + Fan.set_cooling_level.assert_called_with(3, 5) diff --git a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json new file mode 100644 index 0000000000..413211b212 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -0,0 +1,97 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "false", + "fan_speed_when_suspend": "60" + }, + "info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": 
"false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + }, + { + "type": "fan.all.good" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "true" + }, + { + "type": "fan.all.check_and_set_speed", + "speed": "60" + } + ] + } + ] +} \ No newline at end of file diff --git a/rules/docker-platform-monitor.mk b/rules/docker-platform-monitor.mk index a37f4d2e9e..db1c8c5a02 100644 --- a/rules/docker-platform-monitor.mk +++ b/rules/docker-platform-monitor.mk @@ -10,7 +10,7 @@ $(DOCKER_PLATFORM_MONITOR)_DEPENDS += $(LIBSENSORS) $(LM_SENSORS) $(FANCONTROL) ifeq ($(CONFIGURED_PLATFORM),barefoot) $(DOCKER_PLATFORM_MONITOR)_DEPENDS += $(PYTHON_THRIFT) endif -$(DOCKER_PLATFORM_MONITOR)_PYTHON_DEBS += $(SONIC_LEDD) $(SONIC_XCVRD) $(SONIC_PSUD) $(SONIC_SYSEEPROMD) +$(DOCKER_PLATFORM_MONITOR)_PYTHON_DEBS += $(SONIC_LEDD) $(SONIC_XCVRD) $(SONIC_PSUD) $(SONIC_SYSEEPROMD) $(SONIC_THERMALCTLD) $(DOCKER_PLATFORM_MONITOR)_PYTHON_WHEELS += $(SONIC_PLATFORM_COMMON_PY2) $(DOCKER_PLATFORM_MONITOR)_PYTHON_WHEELS += $(SWSSSDK_PY2) $(DOCKER_PLATFORM_MONITOR)_PYTHON_WHEELS += $(SONIC_PLATFORM_API_PY2) diff --git a/rules/sonic-thermalctld.mk b/rules/sonic-thermalctld.mk new file mode 100644 index 0000000000..775082e7bb --- /dev/null +++ b/rules/sonic-thermalctld.mk @@ -0,0 +1,6 @@ +# sonic-thermalctld (SONiC Thermal control daemon) Debian package + 
#
# ProcessTaskBase =====================================================================
#
class ProcessTaskBase(object):  # TODO: put this class to swss-platform-common
    """
    Base class for a task whose task_worker() runs in a separate process.

    Subclasses override task_worker(). Once task_stop() has been called the stopping event stays
    set, so a later task_run() does nothing.
    """
    def __init__(self):
        self.task_process = None
        self.task_stopping_event = multiprocessing.Event()

    def task_worker(self):
        """Work to run in the child process; the default does nothing."""
        pass

    def task_run(self):
        """Start task_worker() in a new process unless a stop has already been requested."""
        if self.task_stopping_event.is_set():
            return

        self.task_process = multiprocessing.Process(target=self.task_worker)
        self.task_process.start()

    def task_stop(self):
        """
        Request a stop and forcibly end the worker process, if one was started.

        Safe to call when task_run() never started a process.
        """
        self.task_stopping_event.set()

        process = self.task_process
        if process is None:
            # Nothing was ever started, so there is no pid to signal.
            return

        # Only signal a worker that is still running: once it has exited and been reaped its pid
        # may belong to an unrelated process.
        if process.is_alive():
            os.kill(process.pid, signal.SIGKILL)
        # Reap the child so it does not linger as a zombie.
        process.join()


#
# ThreadTaskBase =====================================================================
#
class ThreadTaskBase(object):  # TODO: put this class to swss-platform-common;
    """
    Base class for a task whose task_worker() runs in a separate thread.

    Subclasses override task_worker() and are expected to watch task_stopping_event, because
    task_stop() waits for the worker to return.
    """
    def __init__(self):
        self.task_thread = None
        self.task_stopping_event = threading.Event()

    def task_worker(self):
        """Work to run in the worker thread; the default does nothing."""
        pass

    def task_run(self):
        """Start task_worker() in a new thread unless a stop has already been requested."""
        if self.task_stopping_event.is_set():
            return

        self.task_thread = threading.Thread(target=self.task_worker)
        self.task_thread.start()

    def task_stop(self):
        """
        Request a stop and wait for the worker thread, if one was started, to finish.

        Safe to call when task_run() never started a thread.
        """
        self.task_stopping_event.set()
        if self.task_thread is not None:
            self.task_thread.join()
9036e15dffe9b6581e4c724726abbea8446f9993 diff --git a/src/sonic-utilities b/src/sonic-utilities index 6a0a3bedb5..40e7452d30 160000 --- a/src/sonic-utilities +++ b/src/sonic-utilities @@ -1 +1 @@ -Subproject commit 6a0a3bedb57d04eb4dd2f7494aba37e3477674b5 +Subproject commit 40e7452d300758341d31f4afee59f2de2eb4dc47