sonic-buildimage/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Junchao-Mellanox 0c859fb036
[Mellanox] [202012] Fix issue: 4600C is using wrong thermal profile (#10258)
- Why I did it
The 4600C is using the wrong thermal profile, so the "show platform temperature" output displays 2 CPU core thermals; there should be 4 CPU core thermals.

- How I did it
Change 4600C to use thermal profile 10.

- How to verify it
Manual test
2022-03-20 10:31:59 +02:00

762 lines
28 KiB
Python

#############################################################################
# Mellanox
#
# Module contains an implementation of SONiC Platform Base API and
# provides the thermals data which are available in the platform
#
#############################################################################
try:
from sonic_platform_base.thermal_base import ThermalBase
from sonic_py_common.logger import Logger
from os import listdir
from os.path import isfile, join
import io
import os.path
import glob
from . import utils
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
# Module-wide logger instance used by the functions and the Thermal class in this file
logger = Logger()
# Thermal device categories
THERMAL_DEV_CATEGORY_CPU_CORE = "cpu_core"
THERMAL_DEV_CATEGORY_CPU_PACK = "cpu_pack"
THERMAL_DEV_CATEGORY_MODULE = "module"
THERMAL_DEV_CATEGORY_PSU = "psu"
THERMAL_DEV_CATEGORY_GEARBOX = "gearbox"
THERMAL_DEV_CATEGORY_AMBIENT = "ambient"

# Identifiers of the individual sensors inside the ambient category
THERMAL_DEV_ASIC_AMBIENT = "asic_amb"
THERMAL_DEV_FAN_AMBIENT = "fan_amb"
THERMAL_DEV_PORT_AMBIENT = "port_amb"
THERMAL_DEV_COMEX_AMBIENT = "comex_amb"
THERMAL_DEV_BOARD_AMBIENT = "board_amb"

# Names of the thermal APIs, used as keys of the per-category handler tables
THERMAL_API_GET_TEMPERATURE = "get_temperature"
THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold"
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD = "get_high_critical_threshold"

# Raw reading that a module sensor reports when no valid value is available
THERMAL_API_INVALID_HIGH_THRESHOLD = 0.0

# Root folder of the hw-management thermal files and the thermal zone folders below it
HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/"
THERMAL_ZONE_ASIC_PATH = HW_MGMT_THERMAL_ROOT + "mlxsw/"
THERMAL_ZONE_MODULE_PATH = HW_MGMT_THERMAL_ROOT + "mlxsw-module{}/"
THERMAL_ZONE_GEARBOX_PATH = HW_MGMT_THERMAL_ROOT + "mlxsw-gearbox{}/"

# File names found inside a thermal zone folder
THERMAL_ZONE_MODE = "thermal_zone_mode"
THERMAL_ZONE_POLICY = "thermal_zone_policy"
THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp"
THERMAL_ZONE_HOT_THRESHOLD = "temp_trip_hot"
THERMAL_ZONE_HIGH_THRESHOLD = "temp_trip_high"
THERMAL_ZONE_NORMAL_THRESHOLD = "temp_trip_norm"

# Glob pattern matching every thermal zone folder (note: rooted at /run, not /var/run)
THERMAL_ZONE_FOLDER_WILDCARD = '/run/hw-management/thermal/mlxsw*'
# Margin applied to thermal zone thresholds, in the same raw unit as the zone files
THERMAL_ZONE_HYSTERESIS = 5000

COOLING_STATE_PATH = HW_MGMT_THERMAL_ROOT + "cooling_cur_state"

# Min allowed cooling level when all thermal zones are in normal state
MIN_COOLING_LEVEL_FOR_NORMAL = 2
# Min allowed cooling level when any thermal zone is in high state but no thermal zone is in emergency state
MIN_COOLING_LEVEL_FOR_HIGH = 4
MAX_COOLING_LEVEL = 10

MODULE_TEMPERATURE_FAULT_PATH = HW_MGMT_THERMAL_ROOT + "module{}_temp_fault"
# Per-category tables mapping a thermal API name to the file (relative to
# HW_MGMT_THERMAL_ROOT) that backs it. A "{}" placeholder is filled with the
# sensor index; a None entry means the API is not backed by any file.
thermal_api_handler_asic = {
    THERMAL_API_GET_TEMPERATURE: 'asic',
    THERMAL_API_GET_HIGH_THRESHOLD: 'mlxsw/temp_trip_hot',
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: 'mlxsw/temp_trip_crit',
}

thermal_api_handler_cpu_core = {
    THERMAL_API_GET_TEMPERATURE: "cpu_core{}",
    THERMAL_API_GET_HIGH_THRESHOLD: "cpu_core{}_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "cpu_core{}_crit",
}

thermal_api_handler_cpu_pack = {
    THERMAL_API_GET_TEMPERATURE: "cpu_pack",
    THERMAL_API_GET_HIGH_THRESHOLD: "cpu_pack_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "cpu_pack_crit",
}

thermal_api_handler_module = {
    THERMAL_API_GET_TEMPERATURE: "module{}_temp_input",
    THERMAL_API_GET_HIGH_THRESHOLD: "module{}_temp_crit",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "module{}_temp_emergency",
}

thermal_api_handler_psu = {
    THERMAL_API_GET_TEMPERATURE: "psu{}_temp",
    THERMAL_API_GET_HIGH_THRESHOLD: "psu{}_temp_max",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: None,
}

thermal_api_handler_gearbox = {
    THERMAL_API_GET_TEMPERATURE: "gearbox{}_temp_input",
    THERMAL_API_GET_HIGH_THRESHOLD: "mlxsw-gearbox{}/temp_trip_hot",
    THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD: "mlxsw-gearbox{}/temp_trip_crit",
}

# Ambient sensors: the ASIC has a full handler table, every other ambient
# sensor is described only by the name of its temperature file.
thermal_ambient_apis = {
    THERMAL_DEV_ASIC_AMBIENT: thermal_api_handler_asic,
    THERMAL_DEV_PORT_AMBIENT: "port_amb",
    THERMAL_DEV_FAN_AMBIENT: "fan_amb",
    THERMAL_DEV_COMEX_AMBIENT: "comex_amb",
    THERMAL_DEV_BOARD_AMBIENT: "board_amb",
}

# Display names of the ambient sensors
thermal_ambient_name = {
    THERMAL_DEV_ASIC_AMBIENT: 'ASIC',
    THERMAL_DEV_PORT_AMBIENT: "Ambient Port Side Temp",
    THERMAL_DEV_FAN_AMBIENT: "Ambient Fan Side Temp",
    THERMAL_DEV_COMEX_AMBIENT: "Ambient COMEX Temp",
    THERMAL_DEV_BOARD_AMBIENT: "Ambient Board Temp",
}

# Handler table of every non-ambient category
thermal_api_handlers = {
    THERMAL_DEV_CATEGORY_CPU_CORE: thermal_api_handler_cpu_core,
    THERMAL_DEV_CATEGORY_CPU_PACK: thermal_api_handler_cpu_pack,
    THERMAL_DEV_CATEGORY_MODULE: thermal_api_handler_module,
    THERMAL_DEV_CATEGORY_PSU: thermal_api_handler_psu,
    THERMAL_DEV_CATEGORY_GEARBOX: thermal_api_handler_gearbox,
}

# Display names of the non-ambient sensors; "{}" is replaced by the sensor index
thermal_name = {
    THERMAL_DEV_CATEGORY_CPU_CORE: "CPU Core {} Temp",
    THERMAL_DEV_CATEGORY_CPU_PACK: "CPU Pack Temp",
    THERMAL_DEV_CATEGORY_MODULE: "xSFP module {} Temp",
    THERMAL_DEV_CATEGORY_PSU: "PSU-{} Temp",
    THERMAL_DEV_CATEGORY_GEARBOX: "Gearbox {} Temp",
}

# Categories instantiated by initialize_chassis_thermals(), in creation order
thermal_device_categories_all = [
    THERMAL_DEV_CATEGORY_AMBIENT,
    THERMAL_DEV_CATEGORY_CPU_PACK,
    THERMAL_DEV_CATEGORY_CPU_CORE,
    THERMAL_DEV_CATEGORY_GEARBOX,
]

# Categories whose handler file names are used as-is, without an index
thermal_device_categories_singleton = [
    THERMAL_DEV_CATEGORY_CPU_PACK,
    THERMAL_DEV_CATEGORY_AMBIENT,
]

thermal_api_names = [
    THERMAL_API_GET_TEMPERATURE,
    THERMAL_API_GET_HIGH_THRESHOLD,
]
# Maps a platform name to the index of its profile in thermal_profile_list.
# NOTE(review): msn4410 selects profile 8 (labelled 4700) although profile 11 is
# labelled 4410; the two profiles currently have identical contents.
platform_dict_thermal = {
    'x86_64-mlnx_msn2700-r0': 0,
    'x86_64-mlnx_lssn2700-r0': 0,
    'x86_64-mlnx_msn2740-r0': 3,
    'x86_64-mlnx_msn2100-r0': 1,
    'x86_64-mlnx_msn2410-r0': 2,
    'x86_64-mlnx_msn2010-r0': 4,
    'x86_64-mlnx_msn3420-r0': 9,
    'x86_64-mlnx_msn3700-r0': 5,
    'x86_64-mlnx_msn3700c-r0': 6,
    'x86_64-mlnx_msn3800-r0': 7,
    'x86_64-mlnx_msn4600-r0': 12,
    'x86_64-mlnx_msn4600c-r0': 10,
    'x86_64-mlnx_msn4700-r0': 8,
    'x86_64-mlnx_msn4410-r0': 8,
}
# Ambient sensor sets used by the profiles below
_AMBIENT_SENSORS_BASIC = (
    THERMAL_DEV_ASIC_AMBIENT,
    THERMAL_DEV_PORT_AMBIENT,
    THERMAL_DEV_FAN_AMBIENT,
)
_AMBIENT_SENSORS_WITH_COMEX = (
    THERMAL_DEV_ASIC_AMBIENT,
    THERMAL_DEV_COMEX_AMBIENT,
    THERMAL_DEV_PORT_AMBIENT,
    THERMAL_DEV_FAN_AMBIENT,
)


def _make_thermal_profile(cpu_core, module, psu, cpu_pack, gearbox, ambient_sensors):
    """
    Build one thermal profile dictionary.

    Every category argument is a (start index, count) tuple; ambient_sensors
    is the ordered collection of ambient sensor identifiers. Each profile gets
    its own ambient list object.
    """
    return {
        THERMAL_DEV_CATEGORY_CPU_CORE: cpu_core,
        THERMAL_DEV_CATEGORY_MODULE: module,
        THERMAL_DEV_CATEGORY_PSU: psu,
        THERMAL_DEV_CATEGORY_CPU_PACK: cpu_pack,
        THERMAL_DEV_CATEGORY_GEARBOX: gearbox,
        THERMAL_DEV_CATEGORY_AMBIENT: (0, list(ambient_sensors)),
    }


# Indexed by the values of platform_dict_thermal.
# Argument order: cpu_core, module, psu, cpu_pack, gearbox, ambient sensors
thermal_profile_list = [
    # 0 2700
    _make_thermal_profile((0, 2), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 1 2100
    _make_thermal_profile((0, 4), (1, 16), (0, 0), (0, 0), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 2 2410
    _make_thermal_profile((0, 2), (1, 56), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 3 2740
    _make_thermal_profile((0, 4), (1, 32), (1, 2), (0, 0), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 4 2010
    _make_thermal_profile((0, 4), (1, 22), (0, 0), (0, 0), (0, 0), _AMBIENT_SENSORS_BASIC),
    # 5 3700
    _make_thermal_profile((0, 4), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 6 3700c
    _make_thermal_profile((0, 2), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 7 3800
    _make_thermal_profile((0, 4), (1, 64), (1, 2), (0, 1), (1, 32), _AMBIENT_SENSORS_WITH_COMEX),
    # 8 4700
    _make_thermal_profile((0, 4), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 9 3420
    _make_thermal_profile((0, 2), (1, 60), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 10 4600C
    _make_thermal_profile((0, 4), (1, 64), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 11 4410
    _make_thermal_profile((0, 4), (1, 32), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
    # 12 4600
    _make_thermal_profile((0, 4), (1, 64), (1, 2), (0, 1), (0, 0), _AMBIENT_SENSORS_WITH_COMEX),
]
def initialize_psu_thermals(platform, thermal_list, psu_index, dependency):
    """
    Append the thermal sensor of one PSU to thermal_list.

    Nothing is appended when the thermal profile of the platform declares a
    PSU thermal count of 0.

    Args:
        platform: platform name, a key of platform_dict_thermal
        thermal_list: list that receives the new Thermal object
        psu_index: index of the PSU the sensor belongs to
        dependency: passed through to Thermal as its dependency callable
    """
    profile = thermal_profile_list[platform_dict_thermal[platform]]
    _, psu_thermal_count = profile[THERMAL_DEV_CATEGORY_PSU]
    if psu_thermal_count != 0:
        thermal_list.append(Thermal(THERMAL_DEV_CATEGORY_PSU, psu_index, True, 1, dependency))
def initialize_sfp_thermals(platform, thermal_list, sfp_index):
    """
    Append the thermal sensor of one SFP module to thermal_list.

    Args:
        platform: platform name (not used by this function)
        thermal_list: list that receives the new Thermal object
        sfp_index: index of the module the sensor belongs to
    """
    thermal_list.append(Thermal(THERMAL_DEV_CATEGORY_MODULE, sfp_index, True, 1))
def initialize_chassis_thermals(platform, thermal_list):
    """
    Create the chassis level thermal objects for all categories of sensors.

    The thermal profile of the platform is also stored in
    Thermal.thermal_profile. Sensors are appended to thermal_list in the order
    of thermal_device_categories_all and receive consecutive positions
    starting at 1.

    Args:
        platform: platform name, a key of platform_dict_thermal
        thermal_list: list that receives the new Thermal objects
    """
    profile = thermal_profile_list[platform_dict_thermal[platform]]
    Thermal.thermal_profile = profile

    next_position = 1
    for category in thermal_device_categories_all:
        # Work out (index, has_index) for every sensor of this category first
        if category == THERMAL_DEV_CATEGORY_AMBIENT:
            _, ambient_sensors = profile[category]
            sensors = [(ambient, True) for ambient in ambient_sensors]
        else:
            first_index, total = profile.get(category, (0, 0))
            if total == 1:
                # A lone sensor is created without an index in its name
                sensors = [(0, False)]
            else:
                sensors = [(first_index + offset, True) for offset in range(total)]

        for sensor_index, has_index in sensors:
            thermal_list.append(Thermal(category, sensor_index, has_index, next_position))
            next_position += 1
class Thermal(ThermalBase):
    """
    A thermal sensor whose readings come from files under HW_MGMT_THERMAL_ROOT.

    Besides the per-sensor API, the class carries class-level state and
    class methods that control the kernel thermal algorithm and the cooling
    level/state of the whole system.
    """
    # Thermal profile of the running platform, assigned by initialize_chassis_thermals()
    thermal_profile = None
    # Last status passed to set_thermal_algorithm_status()
    thermal_algorithm_status = False
    # Expect cooling level, used for caching the cooling level value before committing to hardware
    expect_cooling_level = None
    # Expect cooling state
    expect_cooling_state = None
    # Last committed cooling level
    last_set_cooling_level = None
    # Last committed cooling state
    last_set_cooling_state = None
    # Cooling level that was last applied to the PSU fans
    last_set_psu_cooling_level = None

    def __init__(self, category, index, has_index, position, dependency = None):
        """
        index should be a string for category ambient and int for other categories

        Args:
            category: one of the THERMAL_DEV_CATEGORY_* values
            index: ambient sensor identifier (ambient category) or sensor number
            has_index: whether the sensor name/files are formatted with index;
                       ignored for the ambient category
            position: value returned by get_position_in_parent()
            dependency: optional callable returning (status, hint); while status
                        is falsy the sensor getters return None
        """
        super(Thermal, self).__init__()
        if category == THERMAL_DEV_CATEGORY_AMBIENT:
            self.name = thermal_ambient_name[index]
            self.index = index
        elif has_index:
            self.name = thermal_name[category].format(index)
            self.index = index
        else:
            self.name = thermal_name[category]
            self.index = 0

        self.category = category
        self.position = position
        # Paths of the files backing each API; None when the API has no file
        self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE)
        self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
        self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
        self.dependency = dependency

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name of the device
        """
        return self.name

    @classmethod
    def _read_generic_file(cls, filename, len):
        """
        Read a generic file, returns the contents of the file

        Args:
            filename: path of the file to read
            len: not used; kept so that existing call sites keep working

        Returns:
            The file content with surrounding whitespace stripped, or None if
            the file could not be read (the failure is logged at info level).
        """
        try:
            with open(filename, 'r') as fileobj:
                return fileobj.read().strip()
        except Exception as e:
            logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
            return None

    def _get_file_from_api(self, api_name):
        """
        Resolve the file that backs api_name for this sensor.

        Returns:
            The path below HW_MGMT_THERMAL_ROOT, or None when the API is not
            backed by a file for this sensor.
        """
        if self.category == THERMAL_DEV_CATEGORY_AMBIENT:
            handler = thermal_ambient_apis[self.index]
            if isinstance(handler, str):
                # A plain string only describes the temperature file
                if api_name != THERMAL_API_GET_TEMPERATURE:
                    return None
                filename = handler
            elif isinstance(handler, dict):
                filename = handler[api_name]
            else:
                return None
        else:
            handler = thermal_api_handlers[self.category][api_name]
            if not handler:
                # No file for this API (e.g. PSU has no critical threshold file)
                return None
            if self.category in thermal_device_categories_singleton:
                filename = handler
            else:
                filename = handler.format(self.index)

        return join(HW_MGMT_THERMAL_ROOT, filename)

    def _read_scaled_value(self, file_path, api_name):
        """
        Read a raw value from file_path and return it divided by 1000.

        Returns None when there is no backing file, when the dependency
        reports a falsy status, when the file cannot be read or parsed as a
        number, or when a module sensor reports THERMAL_API_INVALID_HIGH_THRESHOLD.
        """
        if file_path is None:
            return None

        if self.dependency:
            status, hint = self.dependency()
            if not status:
                logger.log_debug("{} for {} failed due to {}".format(api_name, self.name, hint))
                return None

        value_str = self._read_generic_file(file_path, 0)
        if value_str is None:
            return None

        try:
            value_float = float(value_str)
        except ValueError as e:
            # Empty or garbled content is reported as "no reading" instead of raising
            logger.log_info("Fail to parse content of file {} due to {}".format(file_path, repr(e)))
            return None

        if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD:
            return None
        return value_float / 1000.0

    def get_temperature(self):
        """
        Retrieves current temperature reading from thermal

        Returns:
            A float number of current temperature in Celsius up to nearest thousandth
            of one degree Celsius, e.g. 30.125
            None if the reading is not available
        """
        return self._read_scaled_value(self.temperature, THERMAL_API_GET_TEMPERATURE)

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature of thermal

        Returns:
            A float number, the high threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
            None if the threshold is not available
        """
        return self._read_scaled_value(self.high_threshold, THERMAL_API_GET_HIGH_THRESHOLD)

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature of thermal

        Returns:
            A float number, the high critical threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
            None if the threshold is not available
        """
        return self._read_scaled_value(self.high_critical_threshold, THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)

    def get_position_in_parent(self):
        """
        Retrieves 1-based relative physical position in parent device

        Returns:
            integer: The 1-based relative physical position in parent device
        """
        return self.position

    def is_replaceable(self):
        """
        Indicate whether this device is replaceable.

        Returns:
            bool: True if it is replaceable.
        """
        return False

    @classmethod
    def _write_generic_file(cls, filename, content):
        """
        Generic function to write content to a specified file path.

        The write is unconditional. The previous implementation opened the
        file with 'w+' (which truncates it) before comparing the old content,
        so the comparison never prevented a write. Failures are logged at
        info level and not raised.
        """
        try:
            with open(filename, 'w') as file_obj:
                file_obj.write(content)
        except Exception as e:
            logger.log_info("Fail to write file {} due to {}".format(filename, repr(e)))

    @classmethod
    def _write_thermal_zone_config(cls, zone_path, mode, policy):
        """Write the mode and policy files of the thermal zone folder zone_path."""
        cls._write_generic_file(join(zone_path, THERMAL_ZONE_MODE), mode)
        cls._write_generic_file(join(zone_path, THERMAL_ZONE_POLICY), policy)

    @classmethod
    def set_thermal_algorithm_status(cls, status, force=True):
        """
        Enable/disable kernel thermal algorithm.
        When enable kernel thermal algorithm, kernel will adjust fan speed
        according to thermal zones temperature. Please note that kernel will
        only adjust fan speed when temperature across some "edge", e.g temperature
        changes to exceed high threshold.
        When disable kernel thermal algorithm, kernel no longer adjust fan speed.
        We usually disable the algorithm when we want to set a fix speed. E.g, when
        a fan unit is removed from system, we will set fan speed to 100% and disable
        the algorithm to avoid it adjust the speed.

        Args:
            status: True to enable the algorithm, False to disable it
            force: when False, nothing is written if status equals the cached status

        Returns:
            True if thermal algorithm status changed.

        Raises:
            Exception: if no thermal profile has been set on the class
        """
        if not cls.thermal_profile:
            raise Exception("Fail to get thermal profile for this switch")

        if not force and cls.thermal_algorithm_status == status:
            return False

        cls.thermal_algorithm_status = status
        content = "enabled" if status else "disabled"
        policy = "step_wise" if status else "user_space"

        # ASIC zone first, then every module zone, then every gearbox zone
        cls._write_thermal_zone_config(THERMAL_ZONE_ASIC_PATH, content, policy)
        indexed_zones = (
            (THERMAL_DEV_CATEGORY_MODULE, THERMAL_ZONE_MODULE_PATH),
            (THERMAL_DEV_CATEGORY_GEARBOX, THERMAL_ZONE_GEARBOX_PATH),
        )
        for category, path_template in indexed_zones:
            if category not in cls.thermal_profile:
                continue
            start, count = cls.thermal_profile[category]
            for index in range(count):
                cls._write_thermal_zone_config(path_template.format(start + index), content, policy)
        return True

    @classmethod
    def get_min_allowed_cooling_level_by_thermal_zone(cls):
        """Get min allowed cooling level according to thermal zone status:
            1. If temperature of all thermal zones is less than normal threshold - THERMAL_ZONE_HYSTERESIS,
               min allowed cooling level is $MIN_COOLING_LEVEL_FOR_NORMAL = 2
            2. If temperature of any thermal zone is not below that bound, but no thermal zone temperature
               reaches high threshold - THERMAL_ZONE_HYSTERESIS, min allowed cooling level is
               $MIN_COOLING_LEVEL_FOR_HIGH = 4
            3. Otherwise, there is no minimum allowed value and policy should not adjust cooling level

        Returns:
            int: minimum allowed cooling level, or None when there is no minimum, when no thermal
                 zone folder exists, or when reading the thermal zone files failed
        """
        min_allowed = MIN_COOLING_LEVEL_FOR_NORMAL
        thermal_zone_present = False
        # Pre-bind so the error message below cannot hit an unbound name
        thermal_zone_folder = None
        try:
            for thermal_zone_folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
                thermal_zone_present = True
                normal_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_NORMAL_THRESHOLD))
                current = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_TEMPERATURE))
                if current < normal_thresh - THERMAL_ZONE_HYSTERESIS:
                    continue

                high_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_HIGH_THRESHOLD))
                if current < high_thresh - THERMAL_ZONE_HYSTERESIS:
                    min_allowed = MIN_COOLING_LEVEL_FOR_HIGH
                else:
                    # One zone beyond the high bound is enough: no minimum applies
                    min_allowed = None
                    break
        except Exception as e:
            logger.log_error('Failed to get thermal zone status for {} - {}'.format(thermal_zone_folder, repr(e)))
            return None

        return min_allowed if thermal_zone_present else None

    @classmethod
    def check_module_temperature_trustable(cls):
        """
        Check the temperature fault file of every module in the thermal profile.

        Returns:
            'trust' if every fault file contains '0', otherwise 'untrust'.
            A fault file that cannot be read also yields 'untrust'.

        Raises:
            Exception: if no thermal profile has been set on the class
        """
        if not cls.thermal_profile:
            raise Exception("Fail to get thermal profile for this switch")

        start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE]
        for index in range(count):
            fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start)
            fault = cls._read_generic_file(fault_file_path, 0)
            # _read_generic_file returns None on failure; do not call strip() on it
            if fault is None or fault.strip() != '0':
                return 'untrust'
        return 'trust'

    @classmethod
    def get_min_amb_temperature(cls):
        """
        Return the lower of the fan side and port side ambient readings, as the
        raw integer stored in the files (no scaling is applied).

        Raises:
            Whatever int() raises when a file is unreadable or not an integer.
        """
        fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT)
        port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT)

        # if there is any exception, let it raise
        fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0))
        port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0))
        return fan_ambient_temp if fan_ambient_temp < port_ambient_temp else port_ambient_temp

    @classmethod
    def set_cooling_level(cls, level):
        """
        Change cooling level. The input level should be an integer value [1, 10].
        1 means 10%, 2 means 20%, 10 means 100%.

        Nothing is written when level equals the last committed level.
        """
        if cls.last_set_cooling_level != level:
            # NOTE(review): the value written is level + 10, while set_cooling_state()
            # writes the plain value to the same file - presumably the offset marks
            # the write as a cooling level rather than a state; confirm against hw-management
            utils.write_file(COOLING_STATE_PATH, level + 10, raise_exception=True)
            cls.last_set_cooling_level = level

    @classmethod
    def set_cooling_state(cls, state):
        """Change cooling state.

        Nothing is written when state equals the last committed state.

        Args:
            state (int): cooling state
        """
        if cls.last_set_cooling_state != state:
            utils.write_file(COOLING_STATE_PATH, state, raise_exception=True)
            cls.last_set_cooling_state = state

    @classmethod
    def get_cooling_level(cls):
        """Read the current value of COOLING_STATE_PATH.

        Returns:
            int: value read from the cooling state file

        Raises:
            RuntimeError: if the file cannot be read or parsed
        """
        try:
            return utils.read_int_from_file(COOLING_STATE_PATH, raise_exception=True)
        except (ValueError, IOError) as e:
            raise RuntimeError("Failed to get cooling level - {}".format(e))

    @classmethod
    def set_expect_cooling_level(cls, expect_value):
        """During thermal policy running, cache the expect cooling level generated by policies. The max expect
        cooling level will be committed to hardware.

        Args:
            expect_value (int): Expected cooling level value
        """
        if cls.expect_cooling_level is None or cls.expect_cooling_level < expect_value:
            cls.expect_cooling_level = int(expect_value)

    @classmethod
    def commit_cooling_level(cls, thermal_info_dict):
        """Commit cooling level to hardware. This will affect system fan and PSU fan speed.

        Args:
            thermal_info_dict (dict): Thermal information dictionary
        """
        if cls.expect_cooling_level is not None:
            cls.set_cooling_level(cls.expect_cooling_level)

        # An explicit expected state wins; otherwise the state follows the expected level
        if cls.expect_cooling_state is not None:
            cls.set_cooling_state(cls.expect_cooling_state)
        elif cls.expect_cooling_level is not None:
            cls.set_cooling_state(cls.expect_cooling_level)

        cls.expect_cooling_level = None
        # System fan speed is not set explicitly in this method.
        # NOTE(review): the original comment here suggests the kernel automatically adjusts
        # fan speed according to cooling level and cooling state - confirm

        # Commit PSU fan speed with current state
        from .thermal_infos import ChassisInfo
        if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo):
            cooling_level = cls.get_cooling_level()
            if cls.last_set_psu_cooling_level == cooling_level:
                return
            speed = cooling_level * 10
            chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis()
            for psu in chassis.get_all_psus():
                for psu_fan in psu.get_all_fans():
                    psu_fan.set_speed(speed)
            cls.last_set_psu_cooling_level = cooling_level

    @classmethod
    def monitor_asic_themal_zone(cls):
        """This is a protection for asic thermal zone, if asic temperature is greater than or equal to
        hot threshold + THERMAL_ZONE_HYSTERESIS, the expected cooling state is set to MAX_COOLING_LEVEL;
        otherwise the expected cooling state is cleared.
        """
        asic_temp = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_TEMPERATURE), raise_exception=True)
        hot_thresh = utils.read_int_from_file(os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_HOT_THRESHOLD), raise_exception=True)
        if asic_temp >= hot_thresh + THERMAL_ZONE_HYSTERESIS:
            cls.expect_cooling_state = MAX_COOLING_LEVEL
        else:
            cls.expect_cooling_state = None