2ffc9d572f
- Why I did it: Optimize the thermal control policies to simplify the logic, and add more protection code in the policies to make sure thermal control works even if the kernel algorithm does not work. - How I did it: Removed unused thermal policies. Added a timely ASIC temperature check in the thermal policy to make sure ASIC temperature and fan speed are coordinated. The minimum allowed fan speed is now calculated as the maximum of the expected fan speeds among all policies. Moved some logic from fan.py to thermal.py to make it more readable. - How to verify it: 1. Manual test. 2. Regression.
688 lines
26 KiB
Python
688 lines
26 KiB
Python
#!/usr/bin/env python
|
|
|
|
#############################################################################
|
|
# Mellanox
|
|
#
|
|
# Module contains an implementation of SONiC Platform Base API and
|
|
# provides the thermals data which are available in the platform
|
|
#
|
|
#############################################################################
|
|
|
|
try:
|
|
from sonic_platform_base.thermal_base import ThermalBase
|
|
from sonic_py_common.logger import Logger
|
|
from os import listdir
|
|
from os.path import isfile, join
|
|
import io
|
|
import os.path
|
|
import glob
|
|
|
|
from . import utils
|
|
except ImportError as e:
|
|
raise ImportError (str(e) + "- required module not found")
|
|
|
|
# Global logger class instance; the Thermal class below uses it for its
# info/debug/error messages.
logger = Logger()
|
|
|
|
# Sensor categories handled by this module.
THERMAL_DEV_CATEGORY_CPU_CORE = "cpu_core"
THERMAL_DEV_CATEGORY_CPU_PACK = "cpu_pack"
THERMAL_DEV_CATEGORY_MODULE = "module"
THERMAL_DEV_CATEGORY_PSU = "psu"
THERMAL_DEV_CATEGORY_GEARBOX = "gearbox"
THERMAL_DEV_CATEGORY_AMBIENT = "ambient"

# Identifiers of the individual ambient sensors.
THERMAL_DEV_ASIC_AMBIENT = "asic_amb"
THERMAL_DEV_FAN_AMBIENT = "fan_amb"
THERMAL_DEV_PORT_AMBIENT = "port_amb"
THERMAL_DEV_COMEX_AMBIENT = "comex_amb"
THERMAL_DEV_BOARD_AMBIENT = "board_amb"

# Names of the sensor APIs; used as keys of the handler tables below.
THERMAL_API_GET_TEMPERATURE = "get_temperature"
THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold"
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD = "get_high_critical_threshold"

# A module sensor reading equal to this value is reported as "no value".
THERMAL_API_INVALID_HIGH_THRESHOLD = 0.0

# Root folder of the sensor files; the other paths are built on top of it.
HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/"

# Thermal zone folders and the files read/written inside them.
THERMAL_ZONE_ASIC_PATH = HW_MGMT_THERMAL_ROOT + "mlxsw/"
THERMAL_ZONE_MODULE_PATH = HW_MGMT_THERMAL_ROOT + "mlxsw-module{}/"
THERMAL_ZONE_GEARBOX_PATH = HW_MGMT_THERMAL_ROOT + "mlxsw-gearbox{}/"
THERMAL_ZONE_MODE = "thermal_zone_mode"
THERMAL_ZONE_POLICY = "thermal_zone_policy"
THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp"
THERMAL_ZONE_HOT_THRESHOLD = "temp_trip_hot"
THERMAL_ZONE_HIGH_THRESHOLD = "temp_trip_high"
THERMAL_ZONE_NORMAL_THRESHOLD = "temp_trip_norm"
# Note: this wildcard is rooted at /run, not /var/run like the paths above.
THERMAL_ZONE_FOLDER_WILDCARD = '/run/hw-management/thermal/mlxsw*'
# Margin applied to thresholds, in the same unit as the thermal zone files.
THERMAL_ZONE_HYSTERESIS = 5000
COOLING_STATE_PATH = HW_MGMT_THERMAL_ROOT + "cooling_cur_state"
# Min allowed cooling level when all thermal zones are in normal state
MIN_COOLING_LEVEL_FOR_NORMAL = 2
# Min allowed cooling level when any thermal zone is in high state but no
# thermal zone is in emergency state
MIN_COOLING_LEVEL_FOR_HIGH = 4
MAX_COOLING_LEVEL = 10

MODULE_TEMPERATURE_FAULT_PATH = HW_MGMT_THERMAL_ROOT + "module{}_temp_fault"

# Per-category tables: API name -> file name (or file name template taking the
# sensor index). None means the API is not available for that category.
thermal_api_handler_cpu_core = {
    THERMAL_API_GET_TEMPERATURE: "cpu_core{}",
    THERMAL_API_GET_HIGH_THRESHOLD: "cpu_core{}_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "cpu_core{}_crit",
}
thermal_api_handler_cpu_pack = {
    THERMAL_API_GET_TEMPERATURE: "cpu_pack",
    THERMAL_API_GET_HIGH_THRESHOLD: "cpu_pack_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "cpu_pack_crit",
}
thermal_api_handler_module = {
    THERMAL_API_GET_TEMPERATURE: "module{}_temp_input",
    THERMAL_API_GET_HIGH_THRESHOLD: "module{}_temp_crit",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "module{}_temp_emergency",
}
thermal_api_handler_psu = {
    THERMAL_API_GET_TEMPERATURE: "psu{}_temp",
    THERMAL_API_GET_HIGH_THRESHOLD: "psu{}_temp_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: None,
}
thermal_api_handler_gearbox = {
    THERMAL_API_GET_TEMPERATURE: "gearbox{}_temp_input",
    THERMAL_API_GET_HIGH_THRESHOLD: None,
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: None,
}

# Ambient sensor identifier -> temperature file name.
thermal_ambient_apis = {
    THERMAL_DEV_ASIC_AMBIENT: "asic",
    THERMAL_DEV_PORT_AMBIENT: "port_amb",
    THERMAL_DEV_FAN_AMBIENT: "fan_amb",
    THERMAL_DEV_COMEX_AMBIENT: "comex_amb",
    THERMAL_DEV_BOARD_AMBIENT: "board_amb",
}

# Ambient sensor identifier -> display name.
thermal_ambient_name = {
    THERMAL_DEV_ASIC_AMBIENT: "Ambient ASIC Temp",
    THERMAL_DEV_PORT_AMBIENT: "Ambient Port Side Temp",
    THERMAL_DEV_FAN_AMBIENT: "Ambient Fan Side Temp",
    THERMAL_DEV_COMEX_AMBIENT: "Ambient COMEX Temp",
    THERMAL_DEV_BOARD_AMBIENT: "Ambient Board Temp",
}

# Category -> handler table (ambient sensors use thermal_ambient_apis instead).
thermal_api_handlers = {
    THERMAL_DEV_CATEGORY_CPU_CORE: thermal_api_handler_cpu_core,
    THERMAL_DEV_CATEGORY_CPU_PACK: thermal_api_handler_cpu_pack,
    THERMAL_DEV_CATEGORY_MODULE: thermal_api_handler_module,
    THERMAL_DEV_CATEGORY_PSU: thermal_api_handler_psu,
    THERMAL_DEV_CATEGORY_GEARBOX: thermal_api_handler_gearbox,
}

# Category -> display name (template taking the sensor index where indexed).
thermal_name = {
    THERMAL_DEV_CATEGORY_CPU_CORE: "CPU Core {} Temp",
    THERMAL_DEV_CATEGORY_CPU_PACK: "CPU Pack Temp",
    THERMAL_DEV_CATEGORY_MODULE: "xSFP module {} Temp",
    THERMAL_DEV_CATEGORY_PSU: "PSU-{} Temp",
    THERMAL_DEV_CATEGORY_GEARBOX: "Gearbox {} Temp",
}

# Order in which sensor objects are created for a platform.
thermal_device_categories_all = [
    THERMAL_DEV_CATEGORY_CPU_CORE,
    THERMAL_DEV_CATEGORY_CPU_PACK,
    THERMAL_DEV_CATEGORY_MODULE,
    THERMAL_DEV_CATEGORY_PSU,
    THERMAL_DEV_CATEGORY_AMBIENT,
    THERMAL_DEV_CATEGORY_GEARBOX,
]

# Categories whose file names are used as-is, without a sensor index.
thermal_device_categories_singleton = [
    THERMAL_DEV_CATEGORY_CPU_PACK,
    THERMAL_DEV_CATEGORY_AMBIENT,
]
thermal_api_names = [
    THERMAL_API_GET_TEMPERATURE,
    THERMAL_API_GET_HIGH_THRESHOLD,
]
|
|
|
|
# Maps a platform name to the index of its entry in thermal_profile_list.
# The index must match the position of the "# <model>" entry in that list.
platform_dict_thermal = {
    'x86_64-mlnx_msn2700-r0': 0,
    'x86_64-mlnx_lssn2700-r0': 0,
    'x86_64-mlnx_msn2740-r0': 3,
    'x86_64-mlnx_msn2100-r0': 1,
    'x86_64-mlnx_msn2410-r0': 2,
    'x86_64-mlnx_msn2010-r0': 4,
    'x86_64-mlnx_msn3420-r0': 9,
    'x86_64-mlnx_msn3700-r0': 5,
    'x86_64-mlnx_msn3700c-r0': 6,
    'x86_64-mlnx_msn3800-r0': 7,
    # The 4600C profile is entry 10; it used to point at entry 9, which is
    # the 3420 profile (60 modules / 2 CPU cores instead of 64 / 4).
    'x86_64-mlnx_msn4600c-r0': 10,
    'x86_64-mlnx_msn4700-r0': 8,
}
|
|
# Ambient sensor sets shared by the profiles below. Every profile receives its
# own copy of the list.
_AMBIENT_SENSORS_BASIC = (
    THERMAL_DEV_ASIC_AMBIENT,
    THERMAL_DEV_PORT_AMBIENT,
    THERMAL_DEV_FAN_AMBIENT,
)
_AMBIENT_SENSORS_WITH_COMEX = (
    THERMAL_DEV_ASIC_AMBIENT,
    THERMAL_DEV_COMEX_AMBIENT,
    THERMAL_DEV_PORT_AMBIENT,
    THERMAL_DEV_FAN_AMBIENT,
)


def _build_thermal_profile(cpu_core, module, psu, cpu_pack, gearbox, ambient):
    """Assemble one platform profile.

    Each indexed category is a (start index, count) tuple; the ambient entry
    is (0, [ambient sensor identifiers]).
    """
    return {
        THERMAL_DEV_CATEGORY_CPU_CORE: cpu_core,
        THERMAL_DEV_CATEGORY_MODULE: module,
        THERMAL_DEV_CATEGORY_PSU: psu,
        THERMAL_DEV_CATEGORY_CPU_PACK: cpu_pack,
        THERMAL_DEV_CATEGORY_GEARBOX: gearbox,
        THERMAL_DEV_CATEGORY_AMBIENT: (0, list(ambient)),
    }


# One profile per platform family; positions are referenced by index from
# platform_dict_thermal, so the order of the entries must not change.
thermal_profile_list = [
    # 2700
    _build_thermal_profile((0, 2), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 2100
    _build_thermal_profile((0, 4), (1, 16), (0, 0), (0, 0), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 2410
    _build_thermal_profile((0, 2), (1, 56), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 2740
    _build_thermal_profile((0, 4), (1, 32), (1, 2), (0, 0), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 2010
    _build_thermal_profile((0, 4), (1, 22), (0, 0), (0, 0), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 3700
    _build_thermal_profile((0, 4), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 3700c
    _build_thermal_profile((0, 2), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 3800
    _build_thermal_profile((0, 4), (1, 64), (1, 2), (0, 1), (1, 32), _AMBIENT_SENSORS_WITH_COMEX),
    # 4700
    _build_thermal_profile((0, 4), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 3420
    _build_thermal_profile((0, 2), (1, 60), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 4600C
    _build_thermal_profile((0, 4), (1, 64), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
]
|
|
|
|
|
|
def initialize_thermals(platform, thermal_list, psu_list):
    """Create the Thermal objects of a platform and append them to thermal_list.

    Also stores the selected profile in Thermal.thermal_profile.

    Args:
        platform: platform name, a key of platform_dict_thermal
        thermal_list: list the new Thermal objects are appended to
        psu_list: PSU objects; the get_power_available_status method of
            psu_list[i] becomes the dependency of the i-th PSU thermal
    """
    profile = thermal_profile_list[platform_dict_thermal[platform]]
    Thermal.thermal_profile = profile

    for category in thermal_device_categories_all:
        if category == THERMAL_DEV_CATEGORY_AMBIENT:
            # Ambient sensors are listed by identifier instead of start/count.
            _, ambient_sensors = profile[category]
            for sensor_id in ambient_sensors:
                thermal_list.append(Thermal(category, sensor_id, True))
            continue

        # A category missing from the profile behaves like an empty one.
        first, total = profile[category] if category in profile else (0, 0)

        if total == 1:
            # A single sensor is created without an index.
            thermal_list.append(Thermal(category, 0, False))
            continue

        is_psu = category == THERMAL_DEV_CATEGORY_PSU
        for offset in range(total):
            if is_psu:
                sensor = Thermal(category, first + offset, True,
                                 psu_list[offset].get_power_available_status)
            else:
                sensor = Thermal(category, first + offset, True)
            thermal_list.append(sensor)
|
|
|
|
|
|
|
|
class Thermal(ThermalBase):
    """A temperature sensor backed by files under HW_MGMT_THERMAL_ROOT.

    Instances expose the temperature and thresholds of one sensor. The class
    methods manage state shared by all sensors: the mode/policy files of the
    thermal zones and the cooling level/state written to COOLING_STATE_PATH.
    """

    # Profile of the running platform; set by initialize_thermals()
    thermal_profile = None
    # Last status passed to set_thermal_algorithm_status()
    thermal_algorithm_status = False
    # Expect cooling level, used for caching the cooling level value before
    # committing to hardware
    expect_cooling_level = None
    # Expect cooling state
    expect_cooling_state = None
    # Last committed cooling level
    last_set_cooling_level = None
    # Last committed cooling state
    last_set_cooling_state = None
    # Cooling level last applied to the PSU fans
    last_set_psu_cooling_level = None

    def __init__(self, category, index, has_index, dependency = None):
        """
        index should be a string for category ambient and int for other categories

        Args:
            category: one of the THERMAL_DEV_CATEGORY_* values
            index: ambient sensor identifier, or numeric sensor index
            has_index: whether the sensor name/file is formatted with index
            dependency: optional callable returning (status, hint); when
                status is false the getters return None
        """
        if category == THERMAL_DEV_CATEGORY_AMBIENT:
            self.name = thermal_ambient_name[index]
            self.index = index
        elif has_index:
            self.name = thermal_name[category].format(index)
            self.index = index
        else:
            self.name = thermal_name[category]
            self.index = 0

        self.category = category
        self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE)
        self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
        self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
        self.dependency = dependency

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name of the device
        """
        return self.name

    @classmethod
    def _read_generic_file(cls, filename, len):
        """
        Read a generic file, returns the contents of the file

        Args:
            filename: path of the file to read
            len: unused; kept so existing callers keep working

        Returns:
            The content with surrounding whitespace stripped, or None when
            the file cannot be read (the failure is logged at info level).
        """
        result = None
        try:
            with open(filename, 'r') as fileobj:
                result = fileobj.read().strip()
        except Exception as e:
            logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
        return result

    def _get_file_from_api(self, api_name):
        """
        Resolve the path of the file serving api_name for this sensor.

        Returns:
            The path under HW_MGMT_THERMAL_ROOT, or None when the API is not
            available for this sensor.
        """
        if self.category == THERMAL_DEV_CATEGORY_AMBIENT:
            # Ambient sensors only provide a temperature file.
            if api_name != THERMAL_API_GET_TEMPERATURE:
                return None
            filename = thermal_ambient_apis[self.index]
        else:
            handler = thermal_api_handlers[self.category][api_name]
            if not handler:
                # No file for this API; also avoids join() on None.
                return None
            if self.category in thermal_device_categories_singleton:
                filename = handler
            else:
                filename = handler.format(self.index)
        return join(HW_MGMT_THERMAL_ROOT, filename)

    def _read_sensor_value(self, file_path, api_name):
        """
        Shared implementation of the temperature/threshold getters.

        Args:
            file_path: file holding the raw value
            api_name: name of the calling API, used in log messages

        Returns:
            The raw value divided by 1000.0, or None when the dependency is
            not satisfied, the file cannot be read or parsed, or a module
            sensor reports THERMAL_API_INVALID_HIGH_THRESHOLD.
        """
        if self.dependency:
            status, hint = self.dependency()
            if not status:
                logger.log_debug("{} for {} failed due to {}".format(api_name, self.name, hint))
                return None

        value_str = self._read_generic_file(file_path, 0)
        if value_str is None:
            return None

        try:
            value_float = float(value_str)
        except ValueError as e:
            # Empty or garbled file content: report "no value" instead of raising.
            logger.log_info("Fail to parse content of file {} due to {}".format(file_path, repr(e)))
            return None

        if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD:
            return None
        return value_float / 1000.0

    def get_temperature(self):
        """
        Retrieves current temperature reading from thermal

        Returns:
            A float number of current temperature in Celsius up to nearest thousandth
            of one degree Celsius, e.g. 30.125
            None if the value is not available.
        """
        return self._read_sensor_value(self.temperature, THERMAL_API_GET_TEMPERATURE)

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature of thermal

        Returns:
            A float number, the high threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
            None if the sensor has no high threshold or it is not available.
        """
        if self.high_threshold is None:
            return None
        return self._read_sensor_value(self.high_threshold, THERMAL_API_GET_HIGH_THRESHOLD)

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature of thermal

        Returns:
            A float number, the high critical threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
            None if the sensor has no high critical threshold or it is not available.
        """
        if self.high_critical_threshold is None:
            return None
        return self._read_sensor_value(self.high_critical_threshold, THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)

    @classmethod
    def _write_generic_file(cls, filename, content):
        """
        Generic functions to write content to a specified file path if
        the content has changed.

        The current content is read first (whitespace stripped) and the file
        is only written when it differs or cannot be read. Failures are
        logged at info level and not raised.
        """
        try:
            try:
                # Read before opening for writing: a write mode would
                # truncate the file and make the comparison meaningless.
                with open(filename, 'r') as file_obj:
                    origin_content = file_obj.read().strip()
            except (IOError, OSError):
                origin_content = None
            if origin_content == content:
                return
            with open(filename, 'w') as file_obj:
                file_obj.write(content)
        except Exception as e:
            logger.log_info("Fail to write file {} due to {}".format(filename, repr(e)))

    @classmethod
    def set_thermal_algorithm_status(cls, status, force=True):
        """
        Enable/disable kernel thermal algorithm.
        When enable kernel thermal algorithm, kernel will adjust fan speed
        according to thermal zones temperature. Please note that kernel will
        only adjust fan speed when temperature across some "edge", e.g temperature
        changes to exceed high threshold.
        When disable kernel thermal algorithm, kernel no longer adjust fan speed.
        We usually disable the algorithm when we want to set a fix speed. E.g, when
        a fan unit is removed from system, we will set fan speed to 100% and disable
        the algorithm to avoid it adjust the speed.

        Args:
            status: True to enable, False to disable
            force: when False, nothing is written if status equals the
                status recorded by the previous call

        Returns:
            True if thermal algorithm status changed.

        Raises:
            Exception: if the thermal profile has not been set
        """
        if not cls.thermal_profile:
            raise Exception("Fail to get thermal profile for this switch")

        if not force and cls.thermal_algorithm_status == status:
            return False

        cls.thermal_algorithm_status = status
        content = "enabled" if status else "disabled"
        policy = "step_wise" if status else "user_space"

        # ASIC zone first, then every module zone, then every gearbox zone.
        zone_paths = [THERMAL_ZONE_ASIC_PATH]
        indexed_zones = (
            (THERMAL_DEV_CATEGORY_MODULE, THERMAL_ZONE_MODULE_PATH),
            (THERMAL_DEV_CATEGORY_GEARBOX, THERMAL_ZONE_GEARBOX_PATH),
        )
        for category, path_template in indexed_zones:
            if category not in cls.thermal_profile:
                continue
            start, count = cls.thermal_profile[category]
            for index in range(count):
                zone_paths.append(path_template.format(start + index))

        for zone_path in zone_paths:
            cls._write_generic_file(join(zone_path, THERMAL_ZONE_MODE), content)
            cls._write_generic_file(join(zone_path, THERMAL_ZONE_POLICY), policy)
        return True

    @classmethod
    def get_min_allowed_cooling_level_by_thermal_zone(cls):
        """Get min allowed cooling level according to thermal zone status:
            1. If temperature of every thermal zone is below its normal threshold minus
               THERMAL_ZONE_HYSTERESIS, min allowed cooling level is $MIN_COOLING_LEVEL_FOR_NORMAL = 2
            2. If some thermal zone is not below that limit, but every such zone is below its high
               threshold minus THERMAL_ZONE_HYSTERESIS, min allowed cooling level is
               $MIN_COOLING_LEVEL_FOR_HIGH = 4
            3. Otherwise, there is no minimum allowed value and policy should not adjust cooling level

        Returns:
            int: minimum allowed cooling level, or None for case 3, when no thermal zone
                 folder exists, or when reading the thermal zone files fails
        """
        min_allowed = MIN_COOLING_LEVEL_FOR_NORMAL
        thermal_zone_present = False
        # Pre-bound so the error message below never hits an unbound name.
        thermal_zone_folder = None
        try:
            for thermal_zone_folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
                thermal_zone_present = True
                normal_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_NORMAL_THRESHOLD))
                current = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_TEMPERATURE))
                if current < normal_thresh - THERMAL_ZONE_HYSTERESIS:
                    continue

                high_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_HIGH_THRESHOLD))
                if current < high_thresh - THERMAL_ZONE_HYSTERESIS:
                    min_allowed = MIN_COOLING_LEVEL_FOR_HIGH
                else:
                    # One zone at or beyond the high limit decides the result.
                    min_allowed = None
                    break
        except Exception as e:
            logger.log_error('Failed to get thermal zone status for {} - {}'.format(thermal_zone_folder, repr(e)))
            return None

        return min_allowed if thermal_zone_present else None

    @classmethod
    def check_module_temperature_trustable(cls):
        """Check the temperature fault file of every module in the profile.

        Returns:
            'trust' if every fault file contains '0'; 'untrust' if any fault
            file contains something else or cannot be read.

        Raises:
            Exception: if the thermal profile has not been set
        """
        if not cls.thermal_profile:
            raise Exception("Fail to get thermal profile for this switch")

        start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE]
        for index in range(count):
            fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start)
            fault = cls._read_generic_file(fault_file_path, 0)
            # _read_generic_file returns None on failure and stripped text
            # otherwise; an unreadable file is treated as not trustable.
            if fault is None or fault != '0':
                return 'untrust'
        return 'trust'

    @classmethod
    def get_min_amb_temperature(cls):
        """Return the lower of the fan side and port side ambient readings.

        The raw integer values of the two files are compared and returned
        without unit conversion.
        """
        fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT)
        port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT)

        # if there is any exception, let it raise
        fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0))
        port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0))
        return min(fan_ambient_temp, port_ambient_temp)

    @classmethod
    def set_cooling_level(cls, level):
        """
        Change cooling level. The input level should be an integer value [1, 10].
        1 means 10%, 2 means 20%, 10 means 100%.

        The value written to COOLING_STATE_PATH is level + 10. Nothing is
        written when level equals the last level set through this method.
        """
        if cls.last_set_cooling_level != level:
            utils.write_file(COOLING_STATE_PATH, level + 10, raise_exception=True)
            cls.last_set_cooling_level = level

    @classmethod
    def set_cooling_state(cls, state):
        """Change cooling state.

        Nothing is written when state equals the last state set through this
        method.

        Args:
            state (int): cooling state
        """
        if cls.last_set_cooling_state != state:
            utils.write_file(COOLING_STATE_PATH, state, raise_exception=True)
            cls.last_set_cooling_state = state

    @classmethod
    def get_cooling_level(cls):
        """Read the integer currently stored in COOLING_STATE_PATH.

        Raises:
            RuntimeError: if the file cannot be read or parsed
        """
        try:
            return utils.read_int_from_file(COOLING_STATE_PATH, raise_exception=True)
        except (ValueError, IOError) as e:
            raise RuntimeError("Failed to get cooling level - {}".format(e))

    @classmethod
    def set_expect_cooling_level(cls, expect_value):
        """During thermal policy running, cache the expect cooling level generated by policies. The max expect
        cooling level will be committed to hardware.

        Args:
            expect_value (int): Expected cooling level value
        """
        if cls.expect_cooling_level is None or cls.expect_cooling_level < expect_value:
            cls.expect_cooling_level = int(expect_value)

    @classmethod
    def commit_cooling_level(cls, thermal_info_dict):
        """Commit cooling level to hardware. This will affect system fan and PSU fan speed.

        Args:
            thermal_info_dict (dict): Thermal information dictionary
        """
        if cls.expect_cooling_level is not None:
            cls.set_cooling_level(cls.expect_cooling_level)

        # An explicitly expected cooling state wins over the expected level.
        if cls.expect_cooling_state is not None:
            cls.set_cooling_state(cls.expect_cooling_state)
        elif cls.expect_cooling_level is not None:
            cls.set_cooling_state(cls.expect_cooling_level)

        cls.expect_cooling_level = None
        # The system fan speed is not set explicitly here; presumably the
        # kernel adjusts it according to the cooling level and cooling state
        # written above.
        # NOTE(review): the previous comment read "We need to set system fan
        # speed here", which does not match the code - confirm the intent.

        # Commit PSU fan speed with current state
        from .thermal_infos import ChassisInfo
        if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo):
            cooling_level = cls.get_cooling_level()
            if cls.last_set_psu_cooling_level == cooling_level:
                return
            speed = cooling_level * 10
            chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis()
            for psu in chassis.get_all_psus():
                for psu_fan in psu.get_all_fans():
                    psu_fan.set_speed(speed)
            cls.last_set_psu_cooling_level = cooling_level

    @classmethod
    def monitor_asic_themal_zone(cls):
        """This is a protection for asic thermal zone, if asic temperature is greater than or equal to
        hot threshold + THERMAL_ZONE_HYSTERESIS, the expected cooling state is set to MAX_COOLING_LEVEL;
        otherwise the expected cooling state is cleared.
        """
        asic_temp = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_TEMPERATURE), raise_exception=True)
        hot_thresh = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_HOT_THRESHOLD), raise_exception=True)
        if asic_temp >= hot_thresh + THERMAL_ZONE_HYSTERESIS:
            cls.expect_cooling_state = MAX_COOLING_LEVEL
        else:
            cls.expect_cooling_state = None
|