#!/usr/bin/env python

#############################################################################
# Mellanox
#
# Module contains an implementation of SONiC Platform Base API and
# provides the thermals data which are available in the platform
#
#############################################################################

try:
    from sonic_platform_base.thermal_base import ThermalBase
    from sonic_py_common.logger import Logger
    from os import listdir
    from os.path import isfile, join
    import io
    import os.path
    import glob
    from . import utils
except ImportError as e:
    raise ImportError (str(e) + "- required module not found")

# Global logger class instance
logger = Logger()

# Thermal sensor categories
THERMAL_DEV_CATEGORY_CPU_CORE = "cpu_core"
THERMAL_DEV_CATEGORY_CPU_PACK = "cpu_pack"
THERMAL_DEV_CATEGORY_MODULE = "module"
THERMAL_DEV_CATEGORY_PSU = "psu"
THERMAL_DEV_CATEGORY_GEARBOX = "gearbox"
THERMAL_DEV_CATEGORY_AMBIENT = "ambient"

# Ambient sensor identifiers (used as the "index" of ambient Thermal objects)
THERMAL_DEV_ASIC_AMBIENT = "asic_amb"
THERMAL_DEV_FAN_AMBIENT = "fan_amb"
THERMAL_DEV_PORT_AMBIENT = "port_amb"
THERMAL_DEV_COMEX_AMBIENT = "comex_amb"
THERMAL_DEV_BOARD_AMBIENT = "board_amb"

# API names, used as keys of the per-category file name tables below
THERMAL_API_GET_TEMPERATURE = "get_temperature"
THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold"
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD = "get_high_critical_threshold"

# A module sensor file holding this raw value is reported as "no reading" (None)
THERMAL_API_INVALID_HIGH_THRESHOLD = 0.0

HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/"

THERMAL_ZONE_ASIC_PATH = "/var/run/hw-management/thermal/mlxsw/"
THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/"
THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/"
THERMAL_ZONE_MODE = "thermal_zone_mode"
THERMAL_ZONE_POLICY = "thermal_zone_policy"
THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp"
THERMAL_ZONE_HOT_THRESHOLD = "temp_trip_hot"
THERMAL_ZONE_HIGH_THRESHOLD = "temp_trip_high"
THERMAL_ZONE_NORMAL_THRESHOLD = "temp_trip_norm"
THERMAL_ZONE_FOLDER_WILDCARD = '/run/hw-management/thermal/mlxsw*'
# Margin applied when comparing a zone temperature with a trip point.
# NOTE(review): presumably millidegrees Celsius, like the zone files - confirm.
THERMAL_ZONE_HYSTERESIS = 5000

COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state"
# Min allowed cooling level when all thermal zones are in normal state
MIN_COOLING_LEVEL_FOR_NORMAL = 2
# Min allowed cooling level when any thermal zone is in high state but no thermal zone is in emergency state
MIN_COOLING_LEVEL_FOR_HIGH = 4
MAX_COOLING_LEVEL = 10

MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault"

# Per-category tables: API name -> file name (template) under
# HW_MGMT_THERMAL_ROOT. None means the category does not provide that value.
thermal_api_handler_cpu_core = {
    THERMAL_API_GET_TEMPERATURE: "cpu_core{}",
    THERMAL_API_GET_HIGH_THRESHOLD: "cpu_core{}_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "cpu_core{}_crit"
}
thermal_api_handler_cpu_pack = {
    THERMAL_API_GET_TEMPERATURE: "cpu_pack",
    THERMAL_API_GET_HIGH_THRESHOLD: "cpu_pack_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "cpu_pack_crit"
}
thermal_api_handler_module = {
    THERMAL_API_GET_TEMPERATURE: "module{}_temp_input",
    THERMAL_API_GET_HIGH_THRESHOLD: "module{}_temp_crit",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "module{}_temp_emergency"
}
thermal_api_handler_psu = {
    THERMAL_API_GET_TEMPERATURE: "psu{}_temp",
    THERMAL_API_GET_HIGH_THRESHOLD: "psu{}_temp_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: None
}
thermal_api_handler_gearbox = {
    THERMAL_API_GET_TEMPERATURE: "gearbox{}_temp_input",
    THERMAL_API_GET_HIGH_THRESHOLD: None,
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: None
}

# Ambient sensor identifier -> temperature file name
thermal_ambient_apis = {
    THERMAL_DEV_ASIC_AMBIENT: "asic",
    THERMAL_DEV_PORT_AMBIENT: "port_amb",
    THERMAL_DEV_FAN_AMBIENT: "fan_amb",
    THERMAL_DEV_COMEX_AMBIENT: "comex_amb",
    THERMAL_DEV_BOARD_AMBIENT: "board_amb"
}
# Ambient sensor identifier -> display name
thermal_ambient_name = {
    THERMAL_DEV_ASIC_AMBIENT: "Ambient ASIC Temp",
    THERMAL_DEV_PORT_AMBIENT: "Ambient Port Side Temp",
    THERMAL_DEV_FAN_AMBIENT: "Ambient Fan Side Temp",
    THERMAL_DEV_COMEX_AMBIENT: "Ambient COMEX Temp",
    THERMAL_DEV_BOARD_AMBIENT: "Ambient Board Temp"
}
thermal_api_handlers = {
    THERMAL_DEV_CATEGORY_CPU_CORE: thermal_api_handler_cpu_core,
    THERMAL_DEV_CATEGORY_CPU_PACK: thermal_api_handler_cpu_pack,
    THERMAL_DEV_CATEGORY_MODULE: thermal_api_handler_module,
    THERMAL_DEV_CATEGORY_PSU: thermal_api_handler_psu,
    THERMAL_DEV_CATEGORY_GEARBOX: thermal_api_handler_gearbox
}
# Category -> display name (template)
thermal_name = {
    THERMAL_DEV_CATEGORY_CPU_CORE: "CPU Core {} Temp",
    THERMAL_DEV_CATEGORY_CPU_PACK: "CPU Pack Temp",
    THERMAL_DEV_CATEGORY_MODULE: "xSFP module {} Temp",
    THERMAL_DEV_CATEGORY_PSU: "PSU-{} Temp",
    THERMAL_DEV_CATEGORY_GEARBOX: "Gearbox {} Temp"
}

thermal_device_categories_all = [
    THERMAL_DEV_CATEGORY_CPU_CORE,
    THERMAL_DEV_CATEGORY_CPU_PACK,
    THERMAL_DEV_CATEGORY_MODULE,
    THERMAL_DEV_CATEGORY_PSU,
    THERMAL_DEV_CATEGORY_AMBIENT,
    THERMAL_DEV_CATEGORY_GEARBOX
]

# Categories whose file names carry no index placeholder
thermal_device_categories_singleton = [
    THERMAL_DEV_CATEGORY_CPU_PACK,
    THERMAL_DEV_CATEGORY_AMBIENT
]
thermal_api_names = [
    THERMAL_API_GET_TEMPERATURE,
    THERMAL_API_GET_HIGH_THRESHOLD
]

# Platform name -> index into thermal_profile_list.
# The 4600C entry used to point at index 9 (the 3420 profile); its own
# profile is the one at index 10.
platform_dict_thermal = {
    'x86_64-mlnx_msn2700-r0': 0,
    'x86_64-mlnx_lssn2700-r0': 0,
    'x86_64-mlnx_msn2740-r0': 3,
    'x86_64-mlnx_msn2100-r0': 1,
    'x86_64-mlnx_msn2410-r0': 2,
    'x86_64-mlnx_msn2010-r0': 4,
    'x86_64-mlnx_msn3420-r0': 9,
    'x86_64-mlnx_msn3700-r0': 5,
    'x86_64-mlnx_msn3700c-r0': 6,
    'x86_64-mlnx_msn3800-r0': 7,
    'x86_64-mlnx_msn4600c-r0': 10,
    'x86_64-mlnx_msn4700-r0': 8
}

# One profile per platform family. For every category except ambient the
# value is (start index, sensor count); for ambient it is
# (unused number, list of ambient sensor identifiers).
thermal_profile_list = [
    # 0: 2700
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 2),
        THERMAL_DEV_CATEGORY_MODULE: (1, 32),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    },
    # 1: 2100
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 16),
        THERMAL_DEV_CATEGORY_PSU: (0, 0),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 0),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT,
        ])
    },
    # 2: 2410
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 2),
        THERMAL_DEV_CATEGORY_MODULE: (1, 56),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT,
        ])
    },
    # 3: 2740
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 32),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 0),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT,
        ])
    },
    # 4: 2010
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 22),
        THERMAL_DEV_CATEGORY_PSU: (0, 0),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 0),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT,
        ])
    },
    # 5: 3700
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 32),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_COMEX_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    },
    # 6: 3700c
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 2),
        THERMAL_DEV_CATEGORY_MODULE: (1, 32),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_COMEX_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    },
    # 7: 3800
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 64),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (1, 32),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_COMEX_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    },
    # 8: 4700
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 32),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_COMEX_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    },
    # 9: 3420
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 2),
        THERMAL_DEV_CATEGORY_MODULE: (1, 60),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_COMEX_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    },
    # 10: 4600C
    {
        THERMAL_DEV_CATEGORY_CPU_CORE: (0, 4),
        THERMAL_DEV_CATEGORY_MODULE: (1, 64),
        THERMAL_DEV_CATEGORY_PSU: (1, 2),
        THERMAL_DEV_CATEGORY_CPU_PACK: (0, 1),
        THERMAL_DEV_CATEGORY_GEARBOX: (0, 0),
        THERMAL_DEV_CATEGORY_AMBIENT: (0, [
            THERMAL_DEV_ASIC_AMBIENT,
            THERMAL_DEV_COMEX_AMBIENT,
            THERMAL_DEV_PORT_AMBIENT,
            THERMAL_DEV_FAN_AMBIENT
        ])
    }
]


def initialize_thermals(platform, thermal_list, psu_list):
    """
    Create the Thermal objects of a platform and append them to thermal_list.

    The thermal profile of the platform is also stored on the Thermal class
    (Thermal.thermal_profile) for use by its class methods.

    Args:
        platform: platform name, a key of platform_dict_thermal
        thermal_list: list that receives the created Thermal objects
        psu_list: PSU objects; the get_power_available_status method of
            psu_list[i] becomes the dependency of the i-th PSU thermal

    Raises:
        KeyError: if platform is not listed in platform_dict_thermal
    """
    thermal_profile = thermal_profile_list[platform_dict_thermal[platform]]
    Thermal.thermal_profile = thermal_profile

    for category in thermal_device_categories_all:
        if category == THERMAL_DEV_CATEGORY_AMBIENT:
            # Ambient sensors are addressed by identifier, not by number
            _, ambient_list = thermal_profile[category]
            for ambient in ambient_list:
                thermal_list.append(Thermal(category, ambient, True))
            continue

        start, count = thermal_profile.get(category, (0, 0))
        if count == 0:
            continue

        if count == 1:
            # A single sensor of a category is created without an index
            thermal_list.append(Thermal(category, 0, False))
        elif category == THERMAL_DEV_CATEGORY_PSU:
            # PSU thermals are only readable while the PSU has power
            for index in range(count):
                thermal_list.append(
                    Thermal(category, start + index, True,
                            psu_list[index].get_power_available_status))
        else:
            for index in range(count):
                thermal_list.append(Thermal(category, start + index, True))


class Thermal(ThermalBase):
    """
    A temperature sensor exposed as files under HW_MGMT_THERMAL_ROOT.

    The class methods additionally drive the thermal zones and the cooling
    level/state file shared by the whole system.
    """
    # Thermal profile of the running platform, set by initialize_thermals()
    thermal_profile = None
    # Last status passed to set_thermal_algorithm_status()
    thermal_algorithm_status = False
    # Expect cooling level, used for caching the cooling level value before committing to hardware
    expect_cooling_level = None
    # Expect cooling state
    expect_cooling_state = None
    # Last committed cooling level
    last_set_cooling_level = None
    # Last committed cooling state
    last_set_cooling_state = None
    # Cooling level last applied to the PSU fans
    last_set_psu_cooling_level = None

    def __init__(self, category, index, has_index, dependency = None):
        """
        index should be a string for category ambient and int for other categories

        Args:
            category: one of the THERMAL_DEV_CATEGORY_* values
            index: ambient sensor identifier or sensor number
            has_index: whether the display name carries the index; ignored
                for the ambient category
            dependency: optional callable returning (status, hint); when
                status is false the getters return None and log the hint
        """
        if category == THERMAL_DEV_CATEGORY_AMBIENT:
            self.name = thermal_ambient_name[index]
            self.index = index
        elif has_index:
            self.name = thermal_name[category].format(index)
            self.index = index
        else:
            self.name = thermal_name[category]
            self.index = 0

        self.category = category
        # Paths of the backing files; None when the category has no such value
        self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE)
        self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
        self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
        self.dependency = dependency

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name of the device
        """
        return self.name

    @classmethod
    def _read_generic_file(cls, filename, len):
        """
        Read a generic file, returns the contents of the file

        Args:
            filename: path of the file to read
            len: unused; kept so that existing callers keep working

        Returns:
            The stripped file content, or None if the file cannot be read
            (the failure is logged).
        """
        result = None
        try:
            with open(filename, 'r') as fileobj:
                result = fileobj.read().strip()
        except Exception as e:
            logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
        return result

    def _get_file_from_api(self, api_name):
        """
        Resolve the path of the file backing an API for this sensor.

        Args:
            api_name: one of the THERMAL_API_GET_* values

        Returns:
            The file path, or None if this sensor does not provide the value.
        """
        if self.category == THERMAL_DEV_CATEGORY_AMBIENT:
            # Ambient sensors only expose a temperature
            if api_name != THERMAL_API_GET_TEMPERATURE:
                return None
            filename = thermal_ambient_apis[self.index]
        else:
            handler = thermal_api_handlers[self.category][api_name]
            if self.category in thermal_device_categories_singleton:
                filename = handler
            elif handler:
                filename = handler.format(self.index)
            else:
                return None
        return join(HW_MGMT_THERMAL_ROOT, filename)

    def _read_thermal_value(self, filename, api_name):
        """
        Read a sensor file and convert its content to degrees (raw / 1000).

        Args:
            filename: path of the file to read
            api_name: name of the calling API, used in log messages

        Returns:
            The value as a float, or None when the dependency reports a
            failure, the file cannot be read or parsed, or a module sensor
            holds THERMAL_API_INVALID_HIGH_THRESHOLD.
        """
        if self.dependency:
            status, hint = self.dependency()
            if not status:
                logger.log_debug("{} for {} failed due to {}".format(api_name, self.name, hint))
                return None

        value_str = self._read_generic_file(filename, 0)
        if value_str is None:
            return None

        try:
            value_float = float(value_str)
        except ValueError as e:
            # Malformed content is handled like an unreadable file
            logger.log_info("Fail to parse content of file {} due to {}".format(filename, repr(e)))
            return None

        if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD:
            return None
        return value_float / 1000.0

    def get_temperature(self):
        """
        Retrieves current temperature reading from thermal

        Returns:
            A float number of current temperature in Celsius up to nearest thousandth
            of one degree Celsius, e.g. 30.125
            None if the reading is not available.
        """
        return self._read_thermal_value(self.temperature, THERMAL_API_GET_TEMPERATURE)

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature of thermal

        Returns:
            A float number, the high threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
            None if the sensor has no high threshold or it is not available.
        """
        if self.high_threshold is None:
            return None
        return self._read_thermal_value(self.high_threshold, THERMAL_API_GET_HIGH_THRESHOLD)

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature of thermal

        Returns:
            A float number, the high critical threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
            None if the sensor has no high critical threshold or it is not available.
        """
        if self.high_critical_threshold is None:
            return None
        return self._read_thermal_value(self.high_critical_threshold,
                                        THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)

    @classmethod
    def _write_generic_file(cls, filename, content):
        """
        Generic functions to write content to a specified file path if
        the content has changed.

        The current content is read before the file is opened for writing:
        opening with 'w+' first would truncate the file, so the comparison
        would always see an empty string. Failures are logged, not raised.

        Args:
            filename: path of the file to write
            content: string to write
        """
        try:
            try:
                with open(filename, 'r') as file_obj:
                    origin_content = file_obj.read()
            except (IOError, OSError):
                # Missing or unreadable file: nothing to compare with
                origin_content = None

            # Surrounding whitespace (e.g. a trailing newline) is ignored
            if origin_content is not None and origin_content.strip() == content.strip():
                return

            with open(filename, 'w') as file_obj:
                file_obj.write(content)
        except Exception as e:
            logger.log_info("Fail to write file {} due to {}".format(filename, repr(e)))

    @classmethod
    def _set_thermal_zone_mode_and_policy(cls, zone_path, mode, policy):
        """Write the mode file, then the policy file, of one thermal zone folder."""
        cls._write_generic_file(join(zone_path, THERMAL_ZONE_MODE), mode)
        cls._write_generic_file(join(zone_path, THERMAL_ZONE_POLICY), policy)

    @classmethod
    def set_thermal_algorithm_status(cls, status, force=True):
        """
        Enable/disable kernel thermal algorithm.
        When enable kernel thermal algorithm, kernel will adjust fan speed
        according to thermal zones temperature. Please note that kernel will
        only adjust fan speed when temperature across some "edge", e.g temperature
        changes to exceed high threshold.
        When disable kernel thermal algorithm, kernel no longer adjust fan speed.
        We usually disable the algorithm when we want to set a fix speed. E.g, when
        a fan unit is removed from system, we will set fan speed to 100% and disable
        the algorithm to avoid it adjust the speed.

        Args:
            status: True to enable the algorithm, False to disable it
            force: when False, nothing is written if status equals the
                status set by the previous call

        Returns:
            True if thermal algorithm status changed.

        Raises:
            Exception: if the thermal profile has not been initialized
        """
        if not cls.thermal_profile:
            raise Exception("Fail to get thermal profile for this switch")

        if not force and cls.thermal_algorithm_status == status:
            return False

        cls.thermal_algorithm_status = status
        content = "enabled" if status else "disabled"
        policy = "step_wise" if status else "user_space"

        cls._set_thermal_zone_mode_and_policy(THERMAL_ZONE_ASIC_PATH, content, policy)

        # Module and gearbox zones are numbered from the profile start index
        zone_templates = (
            (THERMAL_DEV_CATEGORY_MODULE, THERMAL_ZONE_MODULE_PATH),
            (THERMAL_DEV_CATEGORY_GEARBOX, THERMAL_ZONE_GEARBOX_PATH),
        )
        for category, path_template in zone_templates:
            if category not in cls.thermal_profile:
                continue
            start, count = cls.thermal_profile[category]
            for index in range(count):
                cls._set_thermal_zone_mode_and_policy(path_template.format(start + index),
                                                      content, policy)
        return True

    @classmethod
    def get_min_allowed_cooling_level_by_thermal_zone(cls):
        """Get min allowed cooling level according to thermal zone status:
            1. If temperature of all thermal zones is less than normal threshold, min allowed cooling level is
               $MIN_COOLING_LEVEL_FOR_NORMAL = 2
            2. If temperature of any thermal zone is greater than normal threshold, but no thermal zone temperature
               is greater than high threshold, min allowed cooling level is $MIN_COOLING_LEVEL_FOR_HIGH = 4
            3. Otherwise, there is no minimum allowed value and policy should not adjust cooling level

        THERMAL_ZONE_HYSTERESIS is subtracted from a threshold before the
        temperature is compared with it.

        Returns:
            int: minimum allowed cooling level; None in case 3, when no
            thermal zone folder exists, or when reading a zone fails
        """
        min_allowed = MIN_COOLING_LEVEL_FOR_NORMAL
        thermal_zone_present = False
        # Bound up front so the error message below can always be formatted
        thermal_zone_folder = None
        try:
            for thermal_zone_folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
                thermal_zone_present = True
                normal_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_NORMAL_THRESHOLD))
                current = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_TEMPERATURE))
                if current < normal_thresh - THERMAL_ZONE_HYSTERESIS:
                    continue

                high_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_HIGH_THRESHOLD))
                if current < high_thresh - THERMAL_ZONE_HYSTERESIS:
                    min_allowed = MIN_COOLING_LEVEL_FOR_HIGH
                else:
                    # A zone beyond its high threshold: no minimum applies
                    min_allowed = None
                    break
        except Exception as e:
            logger.log_error('Failed to get thermal zone status for {} - {}'.format(thermal_zone_folder, repr(e)))
            return None

        return min_allowed if thermal_zone_present else None

    @classmethod
    def check_module_temperature_trustable(cls):
        """
        Check the temperature fault file of every module of the profile.

        Returns:
            'untrust' if any readable fault file holds something other
            than '0', otherwise 'trust'.

        Raises:
            Exception: if the thermal profile has not been initialized
        """
        if not cls.thermal_profile:
            raise Exception("Fail to get thermal profile for this switch")

        start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE]
        for index in range(count):
            fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start)
            fault = cls._read_generic_file(fault_file_path, 0)
            if fault is None:
                # Unreadable file (already logged by _read_generic_file);
                # this used to fail with AttributeError on None.strip().
                # NOTE(review): treated as "no fault reported" - confirm this
                # is preferred over reporting 'untrust'.
                continue
            if fault != '0':
                return 'untrust'
        return 'trust'

    @classmethod
    def get_min_amb_temperature(cls):
        """
        Return the lower of the fan side and port side ambient readings.

        Returns:
            int: the raw value read from the ambient file (not divided by 1000)

        Raises:
            TypeError, ValueError: if a file cannot be read or parsed
        """
        fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT)
        port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT)

        # if there is any exception, let it raise
        fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0))
        port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0))
        return min(fan_ambient_temp, port_ambient_temp)

    @classmethod
    def set_cooling_level(cls, level):
        """
        Change cooling level. The input level should be an integer value [1, 10].
        1 means 10%, 2 means 20%, 10 means 100%.

        Nothing is written if level equals the last level set.
        """
        if cls.last_set_cooling_level != level:
            # NOTE(review): level is written with an offset of 10, while
            # set_cooling_state() writes its value unchanged to the same
            # file; presumably the offset range selects the cooling level
            # rather than the state - confirm against the driver.
            utils.write_file(COOLING_STATE_PATH, level + 10, raise_exception=True)
            cls.last_set_cooling_level = level

    @classmethod
    def set_cooling_state(cls, state):
        """Change cooling state.

        Nothing is written if state equals the last state set.

        Args:
            state (int): cooling state
        """
        if cls.last_set_cooling_state != state:
            utils.write_file(COOLING_STATE_PATH, state, raise_exception=True)
            cls.last_set_cooling_state = state

    @classmethod
    def get_cooling_level(cls):
        """
        Read the current value of the cooling state file.

        Returns:
            int: the value read from COOLING_STATE_PATH

        Raises:
            RuntimeError: if the file cannot be read or parsed
        """
        try:
            return utils.read_int_from_file(COOLING_STATE_PATH, raise_exception=True)
        except (ValueError, IOError) as e:
            raise RuntimeError("Failed to get cooling level - {}".format(e))

    @classmethod
    def set_expect_cooling_level(cls, expect_value):
        """During thermal policy running, cache the expect cooling level generated by policies. The max expect
        cooling level will be committed to hardware.

        Args:
            expect_value (int): Expected cooling level value
        """
        if cls.expect_cooling_level is None or cls.expect_cooling_level < expect_value:
            cls.expect_cooling_level = int(expect_value)

    @classmethod
    def commit_cooling_level(cls, thermal_info_dict):
        """Commit cooling level to hardware. This will affect system fan and PSU fan speed.

        The cached expected cooling level is committed and then cleared. The
        cooling state is set to the expected state if there is one, otherwise
        to the expected level. The PSU fans are then set to the current
        cooling level * 10, unless that level was already applied to them.

        Args:
            thermal_info_dict (dict): Thermal information dictionary
        """
        if cls.expect_cooling_level is not None:
            cls.set_cooling_level(cls.expect_cooling_level)

        if cls.expect_cooling_state is not None:
            cls.set_cooling_state(cls.expect_cooling_state)
        elif cls.expect_cooling_level is not None:
            cls.set_cooling_state(cls.expect_cooling_level)

        cls.expect_cooling_level = None
        # System fan speed is not set explicitly here.
        # NOTE(review): the original comment says the kernel automatically
        # adjusts fan speed according to cooling level and cooling state - confirm.

        # Commit PSU fan speed with current state
        from .thermal_infos import ChassisInfo
        chassis_info = thermal_info_dict.get(ChassisInfo.INFO_NAME)
        if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(chassis_info, ChassisInfo):
            cooling_level = cls.get_cooling_level()
            if cls.last_set_psu_cooling_level == cooling_level:
                return
            speed = cooling_level * 10
            chassis = chassis_info.get_chassis()
            for psu in chassis.get_all_psus():
                for psu_fan in psu.get_all_fans():
                    psu_fan.set_speed(speed)
            cls.last_set_psu_cooling_level = cooling_level

    @classmethod
    def monitor_asic_themal_zone(cls):
        """This is a protection for asic thermal zone, if asic temperature is greater than hot threshold + THERMAL_ZONE_HYSTERESIS,
        and if cooling state is not MAX, we need enforce the cooling state to MAX

        Only the expected cooling state is updated here (MAX_COOLING_LEVEL
        or None).
        """
        asic_temp = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_TEMPERATURE), raise_exception=True)
        hot_thresh = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_HOT_THRESHOLD), raise_exception=True)
        if asic_temp >= hot_thresh + THERMAL_ZONE_HYSTERESIS:
            cls.expect_cooling_state = MAX_COOLING_LEVEL
        else:
            cls.expect_cooling_state = None