[Mellanox] Optimize thermal control policies (#9452)

- Why I did it
Optimize the thermal control policies: simplify the logic and add more protection code to the policies so that thermal control keeps working even if the kernel thermal algorithm does not.

- How I did it
Remove unused thermal policies
Add a timely ASIC temperature check to the thermal policy to make sure ASIC temperature and fan speed are coordinated
The minimum allowed fan speed is now calculated as the maximum of the expected fan speeds among all policies
Move some logic from fan.py to thermal.py to make it more readable

- How to verify it
1. Manual test
2. Regression
This commit is contained in:
Junchao-Mellanox 2022-01-19 17:44:37 +08:00 committed by GitHub
parent bc56e064c3
commit 4ae504a813
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 423 additions and 555 deletions

View File

@ -227,7 +227,7 @@ class DeviceDataManager:
platform_data = DEVICE_DATA.get(cls.get_platform_name(), None) platform_data = DEVICE_DATA.get(cls.get_platform_name(), None)
if not platform_data: if not platform_data:
return None return None
thermal_data = platform_data.get('thermal', None) thermal_data = platform_data.get('thermal', None)
if not thermal_data: if not thermal_data:
return None return None
@ -240,7 +240,7 @@ class DeviceDataManager:
platform_data = DEVICE_DATA.get(cls.get_platform_name(), None) platform_data = DEVICE_DATA.get(cls.get_platform_name(), None)
if not platform_data: if not platform_data:
return None return None
thermal_data = platform_data.get('thermal', None) thermal_data = platform_data.get('thermal', None)
if not thermal_data: if not thermal_data:
return None return None
@ -258,7 +258,7 @@ class DeviceDataManager:
platform_data = DEVICE_DATA.get(cls.get_platform_name(), None) platform_data = DEVICE_DATA.get(cls.get_platform_name(), None)
if not platform_data: if not platform_data:
return 0 return 0
sfp_data = platform_data.get('sfp', None) sfp_data = platform_data.get('sfp', None)
if not sfp_data: if not sfp_data:
return 0 return 0

View File

@ -30,6 +30,7 @@ try:
from sonic_py_common.logger import Logger from sonic_py_common.logger import Logger
from .led import ComponentFaultyIndicator from .led import ComponentFaultyIndicator
from . import utils from . import utils
from .thermal import Thermal
except ImportError as e: except ImportError as e:
raise ImportError (str(e) + "- required module not found") raise ImportError (str(e) + "- required module not found")
@ -44,13 +45,9 @@ CONFIG_PATH = "/var/run/hw-management/config"
FAN_DIR = "/var/run/hw-management/thermal/fan{}_dir" FAN_DIR = "/var/run/hw-management/thermal/fan{}_dir"
FAN_DIR_VALUE_EXHAUST = 0 FAN_DIR_VALUE_EXHAUST = 0
FAN_DIR_VALUE_INTAKE = 1 FAN_DIR_VALUE_INTAKE = 1
COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state"
class MlnxFan(FanBase): class MlnxFan(FanBase):
MIN_VALID_COOLING_LEVEL = 1
MAX_VALID_COOLING_LEVEL = 10
def __init__(self, fan_index, position): def __init__(self, fan_index, position):
super(MlnxFan, self).__init__() super(MlnxFan, self).__init__()
self.index = fan_index + 1 self.index = fan_index + 1
@ -88,7 +85,7 @@ class MlnxFan(FanBase):
fan module status LED fan module status LED
Returns: Returns:
bool: True if set success, False if fail. bool: True if set success, False if fail.
""" """
return self.led.set_status(color) return self.led.set_status(color)
@ -128,37 +125,6 @@ class MlnxFan(FanBase):
""" """
return False return False
@classmethod
def set_cooling_level(cls, level, cur_state):
"""
Change cooling level. The input level should be an integer value [1, 10].
1 means 10%, 2 means 20%, 10 means 100%.
"""
if level < cls.MIN_VALID_COOLING_LEVEL or level > cls.MAX_VALID_COOLING_LEVEL:
raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format(
cls.MIN_VALID_COOLING_LEVEL,
cls.MAX_VALID_COOLING_LEVEL,
level
))
try:
# Reset FAN cooling level vector. According to low level team,
# if we need set cooling level to X, we need first write a (10+X)
# to cooling_cur_state file to reset the cooling level vector.
utils.write_file(COOLING_STATE_PATH, level + 10, raise_exception=True)
# We need set cooling level after resetting the cooling level vector
utils.write_file(COOLING_STATE_PATH, cur_state, raise_exception=True)
except (ValueError, IOError) as e:
raise RuntimeError("Failed to set cooling level - {}".format(e))
@classmethod
def get_cooling_level(cls):
try:
return utils.read_int_from_file(COOLING_STATE_PATH, raise_exception=True)
except (ValueError, IOError) as e:
raise RuntimeError("Failed to get cooling level - {}".format(e))
class PsuFan(MlnxFan): class PsuFan(MlnxFan):
# PSU fan speed vector # PSU fan speed vector
@ -189,7 +155,7 @@ class PsuFan(MlnxFan):
depending on fan direction depending on fan direction
Notes: Notes:
What Mellanox calls forward: What Mellanox calls forward:
Air flows from fans side to QSFP side, for example: MSN2700-CS2F Air flows from fans side to QSFP side, for example: MSN2700-CS2F
which means intake in community which means intake in community
What Mellanox calls reverse: What Mellanox calls reverse:
@ -228,7 +194,7 @@ class PsuFan(MlnxFan):
""" """
try: try:
# Get PSU fan target speed according to current system cooling level # Get PSU fan target speed according to current system cooling level
cooling_level = self.get_cooling_level() cooling_level = Thermal.get_cooling_level()
return int(self.PSU_FAN_SPEED[cooling_level], 16) return int(self.PSU_FAN_SPEED[cooling_level], 16)
except Exception: except Exception:
return self.get_speed() return self.get_speed()
@ -242,7 +208,7 @@ class PsuFan(MlnxFan):
in the range 0 (off) to 100 (full speed) in the range 0 (off) to 100 (full speed)
Returns: Returns:
bool: True if set success, False if fail. bool: True if set success, False if fail.
""" """
if not self.get_presence(): if not self.get_presence():
return False return False
@ -264,12 +230,9 @@ class PsuFan(MlnxFan):
class Fan(MlnxFan): class Fan(MlnxFan):
"""Platform-specific Fan class""" """Platform-specific Fan class"""
min_cooling_level = 2
def __init__(self, fan_index, fan_drawer, position): def __init__(self, fan_index, fan_drawer, position):
super(Fan, self).__init__(fan_index, position) super(Fan, self).__init__(fan_index, position)
self.fan_drawer = fan_drawer self.fan_drawer = fan_drawer
self.led = ComponentFaultyIndicator(self.fan_drawer.get_led()) self.led = ComponentFaultyIndicator(self.fan_drawer.get_led())
@ -278,7 +241,7 @@ class Fan(MlnxFan):
self.fan_speed_set_path = os.path.join(FAN_PATH, "fan{}_speed_set".format(self.index)) self.fan_speed_set_path = os.path.join(FAN_PATH, "fan{}_speed_set".format(self.index))
self.fan_max_speed_path = os.path.join(FAN_PATH, "fan{}_max".format(self.index)) self.fan_max_speed_path = os.path.join(FAN_PATH, "fan{}_max".format(self.index))
self.fan_min_speed_path = os.path.join(FAN_PATH, "fan{}_min".format(self.index)) self.fan_min_speed_path = os.path.join(FAN_PATH, "fan{}_min".format(self.index))
self.fan_status_path = os.path.join(FAN_PATH, "fan{}_fault".format(self.index)) self.fan_status_path = os.path.join(FAN_PATH, "fan{}_fault".format(self.index))
def get_direction(self): def get_direction(self):
@ -290,7 +253,7 @@ class Fan(MlnxFan):
depending on fan direction depending on fan direction
Notes: Notes:
What Mellanox calls forward: What Mellanox calls forward:
Air flows from fans side to QSFP side, for example: MSN2700-CS2F Air flows from fans side to QSFP side, for example: MSN2700-CS2F
which means intake in community which means intake in community
What Mellanox calls reverse: What Mellanox calls reverse:
@ -340,16 +303,11 @@ class Fan(MlnxFan):
in the range 0 (off) to 100 (full speed) in the range 0 (off) to 100 (full speed)
Returns: Returns:
bool: True if set success, False if fail. bool: True if set success, False if fail.
""" """
status = True status = True
try: try:
cooling_level = int(speed // 10)
if cooling_level < self.min_cooling_level:
cooling_level = self.min_cooling_level
speed = self.min_cooling_level * 10
self.set_cooling_level(cooling_level, cooling_level)
pwm = int(PWM_MAX*speed/100.0) pwm = int(PWM_MAX*speed/100.0)
utils.write_file(self.fan_speed_set_path, pwm, raise_exception=True) utils.write_file(self.fan_speed_set_path, pwm, raise_exception=True)
except (ValueError, IOError): except (ValueError, IOError):

View File

@ -38,8 +38,8 @@ except ImportError as e:
logger = Logger() logger = Logger()
""" """
The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and
high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types
of thermal object: single and indexable: of thermal object: single and indexable:
1. Single. Such as asic, port_amb... 1. Single. Such as asic, port_amb...
2. Indexablt. Such as cpu_core0, cpu_core1, psu1_temp, psu2_temp 2. Indexablt. Such as cpu_core0, cpu_core1, psu1_temp, psu2_temp
@ -63,7 +63,7 @@ THERMAL_NAMING_RULE = {
"high_critical_threshold": "module{}_temp_emergency", "high_critical_threshold": "module{}_temp_emergency",
"type": "indexable" "type": "indexable"
}, },
"psu thermals": "psu thermals":
{ {
"name": "PSU-{} Temp", "name": "PSU-{} Temp",
"temperature": "psu{}_temp", "temperature": "psu{}_temp",
@ -131,13 +131,23 @@ THERMAL_NAMING_RULE = {
} }
CHASSIS_THERMAL_SYSFS_FOLDER = '/run/hw-management/thermal' CHASSIS_THERMAL_SYSFS_FOLDER = '/run/hw-management/thermal'
COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state"
THERMAL_ZONE_ASIC_PATH = '/var/run/hw-management/thermal/mlxsw/'
THERMAL_ZONE_FOLDER_WILDCARD = '/run/hw-management/thermal/mlxsw*' THERMAL_ZONE_FOLDER_WILDCARD = '/run/hw-management/thermal/mlxsw*'
THERMAL_ZONE_POLICY_FILE = 'thermal_zone_policy' THERMAL_ZONE_HIGH_THRESHOLD = 'temp_trip_high'
THERMAL_ZONE_HOT_THRESHOLD = 'temp_trip_hot'
THERMAL_ZONE_NORMAL_THRESHOLD = 'temp_trip_norm'
THERMAL_ZONE_MODE_FILE = 'thermal_zone_mode' THERMAL_ZONE_MODE_FILE = 'thermal_zone_mode'
THERMAL_ZONE_POLICY_FILE = 'thermal_zone_policy'
THERMAL_ZONE_TEMP_FILE = 'thermal_zone_temp' THERMAL_ZONE_TEMP_FILE = 'thermal_zone_temp'
THERMAL_ZONE_THRESHOLD_FILE = 'temp_trip_high' THERMAL_ZONE_HYSTERESIS = 5000
MODULE_TEMP_FAULT_WILDCARRD = '/run/hw-management/thermal/module*_temp_fault' MODULE_TEMP_FAULT_WILDCARRD = '/run/hw-management/thermal/module*_temp_fault'
MAX_AMBIENT_TEMP = 120 MAX_AMBIENT_TEMP = 120
# Min allowed cooling level when all thermal zones are in normal state
MIN_COOLING_LEVEL_FOR_NORMAL = 2
# Min allowed cooling level when any thermal zone is in high state but no thermal zone is in emergency state
MIN_COOLING_LEVEL_FOR_HIGH = 4
MAX_COOLING_LEVEL = 10
def initialize_chassis_thermals(): def initialize_chassis_thermals():
@ -172,7 +182,7 @@ def initialize_psu_thermal(psu_index, presence_cb):
Args: Args:
psu_index (int): PSU index, 0-based psu_index (int): PSU index, 0-based
presence_cb (function): A callback function to indicate if the thermal is present. When removing a PSU, the related presence_cb (function): A callback function to indicate if the thermal is present. When removing a PSU, the related
thermal sysfs files will be removed from system, presence_cb is used to check such situation and avoid printing thermal sysfs files will be removed from system, presence_cb is used to check such situation and avoid printing
error logs. error logs.
Returns: Returns:
@ -263,6 +273,14 @@ def _check_thermal_sysfs_existence(file_path):
class Thermal(ThermalBase): class Thermal(ThermalBase):
thermal_algorithm_status = False thermal_algorithm_status = False
# Expected cooling level, used for caching the cooling level value before committing to hardware
expect_cooling_level = None
# Expect cooling state
expect_cooling_state = None
# Last committed cooling level
last_set_cooling_level = None
last_set_cooling_state = None
last_set_psu_cooling_level = None
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position): def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position):
""" """
@ -290,7 +308,7 @@ class Thermal(ThermalBase):
Returns: Returns:
A float number of current temperature in Celsius up to nearest thousandth A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125 of one degree Celsius, e.g. 30.125
""" """
value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info) value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None return value / 1000.0 if (value is not None and value != 0) else None
@ -346,8 +364,8 @@ class Thermal(ThermalBase):
only adjust fan speed when temperature across some "edge", e.g temperature only adjust fan speed when temperature across some "edge", e.g temperature
changes to exceed high threshold. changes to exceed high threshold.
When disable kernel thermal algorithm, kernel no longer adjust fan speed. When disable kernel thermal algorithm, kernel no longer adjust fan speed.
We usually disable the algorithm when we want to set a fix speed. E.g, when We usually disable the algorithm when we want to set a fix speed. E.g, when
a fan unit is removed from system, we will set fan speed to 100% and disable a fan unit is removed from system, we will set fan speed to 100% and disable
the algorithm to avoid it adjust the speed. the algorithm to avoid it adjust the speed.
Returns: Returns:
@ -364,35 +382,41 @@ class Thermal(ThermalBase):
utils.write_file(policy_file, policy) utils.write_file(policy_file, policy)
mode_file = os.path.join(thermal_zone_folder, THERMAL_ZONE_MODE_FILE) mode_file = os.path.join(thermal_zone_folder, THERMAL_ZONE_MODE_FILE)
utils.write_file(mode_file, mode) utils.write_file(mode_file, mode)
return True return True
@classmethod @classmethod
def check_thermal_zone_temperature(cls): def get_min_allowed_cooling_level_by_thermal_zone(cls):
""" """Get min allowed cooling level according to thermal zone status:
Check thermal zone current temperature with normal temperature 1. If temperature of all thermal zones is less than normal threshold, min allowed cooling level is
$MIN_COOLING_LEVEL_FOR_NORMAL = 2
2. If temperature of any thermal zone is greater than normal threshold, but no thermal zone temperature
is greater than high threshold, min allowed cooling level is $MIN_COOLING_LEVEL_FOR_HIGH = 4
3. Otherwise, there is no minimum allowed value and policy should not adjust cooling level
Returns: Returns:
True if all thermal zones current temperature less or equal than normal temperature int: minimum allowed cooling level
""" """
for thermal_zone_folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD): min_allowed = MIN_COOLING_LEVEL_FOR_NORMAL
if not cls._check_thermal_zone_temperature(thermal_zone_folder): thermal_zone_present = False
return False
return True
@classmethod
def _check_thermal_zone_temperature(cls, thermal_zone_path):
threshold_path = os.path.join(thermal_zone_path, THERMAL_ZONE_THRESHOLD_FILE)
current_temp_path = os.path.join(thermal_zone_path, THERMAL_ZONE_TEMP_FILE)
try: try:
threshold = utils.read_int_from_file(threshold_path, raise_exception=True) for thermal_zone_folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
current = utils.read_int_from_file(current_temp_path, raise_exception=True) thermal_zone_present = True
return current <= threshold normal_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_NORMAL_THRESHOLD))
current = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_TEMP_FILE))
if current < normal_thresh - THERMAL_ZONE_HYSTERESIS:
continue
hot_thresh = utils.read_int_from_file(os.path.join(thermal_zone_folder, THERMAL_ZONE_HIGH_THRESHOLD))
if current < hot_thresh - THERMAL_ZONE_HYSTERESIS:
min_allowed = MIN_COOLING_LEVEL_FOR_HIGH
else:
min_allowed = None
break
except Exception as e: except Exception as e:
logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) logger.log_error('Failed to get thermal zone status for {} - {}'.format(thermal_zone_folder, repr(e)))
return False return None
return min_allowed if thermal_zone_present else None
@classmethod @classmethod
def check_module_temperature_trustable(cls): def check_module_temperature_trustable(cls):
@ -416,6 +440,85 @@ class Thermal(ThermalBase):
logger.log_error('Failed to get minimum ambient temperature, use pessimistic instead') logger.log_error('Failed to get minimum ambient temperature, use pessimistic instead')
return MAX_AMBIENT_TEMP return MAX_AMBIENT_TEMP
@classmethod
def set_cooling_level(cls, level):
    """Commit a cooling level to hardware if it differs from the last one committed.

    The level is expected to be an integer in [1, 10], where 1 means 10%,
    2 means 20% and 10 means 100%. No range validation is performed here.

    Args:
        level (int): Cooling level to commit.
    """
    # Skip the sysfs write when this level was already committed.
    if level == cls.last_set_cooling_level:
        return

    # The cooling level is written with an offset of 10 to the cooling state file.
    utils.write_file(COOLING_STATE_PATH, level + 10, raise_exception=True)
    cls.last_set_cooling_level = level
@classmethod
def set_cooling_state(cls, state):
    """Commit a cooling state to hardware if it differs from the last one committed.

    Args:
        state (int): Cooling state to write to the cooling state file.
    """
    # Skip the sysfs write when this state was already committed.
    if state == cls.last_set_cooling_state:
        return

    utils.write_file(COOLING_STATE_PATH, state, raise_exception=True)
    cls.last_set_cooling_state = state
@classmethod
def get_cooling_level(cls):
    """Read the current integer value from the cooling state file.

    Returns:
        int: Value read from COOLING_STATE_PATH.

    Raises:
        RuntimeError: If reading the file raises ValueError or IOError.
    """
    try:
        current = utils.read_int_from_file(COOLING_STATE_PATH, raise_exception=True)
    except (ValueError, IOError) as err:
        raise RuntimeError("Failed to get cooling level - {}".format(err))
    return current
@classmethod
def set_expect_cooling_level(cls, expect_value):
    """Cache the cooling level requested by a policy, keeping only the largest request.

    Nothing is written to hardware here; the cached value is kept in
    ``cls.expect_cooling_level`` until it is committed.

    Args:
        expect_value (int): Expected cooling level value. Non-integer values
            are truncated with int() when stored.
    """
    cached = cls.expect_cooling_level
    # Only ever raise the cached level: the first request always wins over "unset",
    # later requests win only when they are strictly larger.
    should_store = cached is None or cached < expect_value
    if should_store:
        cls.expect_cooling_level = int(expect_value)
@classmethod
def commit_cooling_level(cls, thermal_info_dict):
    """Commit the cached cooling level/state to hardware and sync PSU fan speed.

    Args:
        thermal_info_dict (dict): Thermal information dictionary. When it holds a
            ChassisInfo object under ChassisInfo.INFO_NAME, the PSU fans of that
            chassis are updated as well.
    """
    pending_level = cls.expect_cooling_level
    pending_state = cls.expect_cooling_state

    if pending_level is not None:
        cls.set_cooling_level(pending_level)

    # An explicitly expected cooling state takes precedence; otherwise the
    # state follows the expected cooling level (if there is one).
    target_state = pending_state if pending_state is not None else pending_level
    if target_state is not None:
        cls.set_cooling_state(target_state)

    # The cached level is consumed by this commit; policies must request it again.
    cls.expect_cooling_level = None
    # We need to set system fan speed here because kernel will automatically
    # adjust fan speed according to cooling level and cooling state

    # Commit PSU fan speed with current state
    from .thermal_infos import ChassisInfo
    info_key = ChassisInfo.INFO_NAME
    if info_key not in thermal_info_dict or not isinstance(thermal_info_dict[info_key], ChassisInfo):
        return

    cooling_level = cls.get_cooling_level()
    # Nothing to do when the PSU fans were already set for this cooling level.
    if cooling_level == cls.last_set_psu_cooling_level:
        return

    # Cooling level N corresponds to N * 10 percent fan speed.
    speed = cooling_level * 10
    chassis = thermal_info_dict[info_key].get_chassis()
    for psu in chassis.get_all_psus():
        for psu_fan in psu.get_all_fans():
            psu_fan.set_speed(speed)
    cls.last_set_psu_cooling_level = cooling_level
@classmethod
def monitor_asic_themal_zone(cls):
    """Protect the ASIC thermal zone by requesting the maximum cooling state when it overheats.

    Reads the ASIC thermal zone temperature and its hot threshold. If the
    temperature is greater than or equal to hot threshold + THERMAL_ZONE_HYSTERESIS,
    the expected cooling state is set to MAX_COOLING_LEVEL; otherwise it is cleared
    (set to None).
    """
    temp_path = os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_TEMP_FILE)
    asic_temp = utils.read_int_from_file(temp_path, raise_exception=True)

    hot_path = os.path.join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_HOT_THRESHOLD)
    hot_thresh = utils.read_int_from_file(hot_path, raise_exception=True)

    overheated = asic_temp >= hot_thresh + THERMAL_ZONE_HYSTERESIS
    cls.expect_cooling_state = MAX_COOLING_LEVEL if overheated else None
class RemovableThermal(Thermal): class RemovableThermal(Thermal):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb): def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb):
@ -428,7 +531,7 @@ class RemovableThermal(Thermal):
Returns: Returns:
A float number of current temperature in Celsius up to nearest thousandth A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125 of one degree Celsius, e.g. 30.125
""" """
status, hint = self.presence_cb() status, hint = self.presence_cb()
if not status: if not status:

View File

@ -16,7 +16,7 @@
# #
from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase
from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object
from .thermal import logger from .thermal import Thermal
class SetFanSpeedAction(ThermalPolicyActionBase): class SetFanSpeedAction(ThermalPolicyActionBase):
@ -64,120 +64,28 @@ class SetAllFanSpeedAction(SetFanSpeedAction):
:param thermal_info_dict: A dictionary stores all thermal information. :param thermal_info_dict: A dictionary stores all thermal information.
:return: :return:
""" """
from .thermal_infos import FanInfo Thermal.set_expect_cooling_level(self.speed / 10)
if FanInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[FanInfo.INFO_NAME], FanInfo):
fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME]
for fan in fan_info_obj.get_presence_fans():
fan.set_speed(self.speed)
logger.log_info('Set all system FAN speed to {}'.format(self.speed))
SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed)
@classmethod
def set_psu_fan_speed(cls, thermal_info_dict, speed):
from .thermal_infos import ChassisInfo
if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo):
chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis()
for psu in chassis.get_all_psus():
for psu_fan in psu.get_all_fans():
psu_fan.set_speed(speed)
@thermal_json_object('fan.all.check_and_set_speed')
class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction):
"""
Action to check thermal zone temperature and recover speed for all fans
"""
def execute(self, thermal_info_dict):
"""
Check thermal zone and set speed for all fans
:param thermal_info_dict: A dictionary stores all thermal information.
:return:
"""
from .thermal import Thermal
if Thermal.check_thermal_zone_temperature():
SetAllFanSpeedAction.execute(self, thermal_info_dict)
@thermal_json_object('thermal_control.control')
class ControlThermalAlgoAction(ThermalPolicyActionBase):
"""
Action to control the thermal control algorithm
"""
# JSON field definition
JSON_FIELD_STATUS = 'status'
def __init__(self):
self.status = True
def load_from_json(self, json_obj):
"""
Construct ControlThermalAlgoAction via JSON. JSON example:
{
"type": "thermal_control.control"
"status": "true"
}
:param json_obj: A JSON object representing a ControlThermalAlgoAction action.
:return:
"""
if ControlThermalAlgoAction.JSON_FIELD_STATUS in json_obj:
status_str = json_obj[ControlThermalAlgoAction.JSON_FIELD_STATUS].lower()
if status_str == 'true':
self.status = True
elif status_str == 'false':
self.status = False
else:
raise ValueError('Invalid {} field value, please specify true of false'.
format(ControlThermalAlgoAction.JSON_FIELD_STATUS))
else:
raise ValueError('ControlThermalAlgoAction '
'missing mandatory field {} in JSON policy file'.
format(ControlThermalAlgoAction.JSON_FIELD_STATUS))
def execute(self, thermal_info_dict):
"""
Disable thermal control algorithm
:param thermal_info_dict: A dictionary stores all thermal information.
:return:
"""
from .thermal_infos import FanInfo
from .thermal import Thermal
from .thermal_conditions import UpdateCoolingLevelToMinCondition
from .fan import Fan
status_changed = Thermal.set_thermal_algorithm_status(self.status, False)
# Only update cooling level if thermal algorithm status changed
if status_changed:
if self.status:
# Check thermal zone temperature, if all thermal zone temperature
# back to normal, set it to minimum allowed speed to
# save power
UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict)
logger.log_info('Changed thermal algorithm status to {}'.format(self.status))
@thermal_json_object('thermal.recover') @thermal_json_object('thermal.recover')
class ThermalRecoverAction(ThermalPolicyActionBase): class ThermalRecoverAction(ThermalPolicyActionBase):
def execute(self, thermal_info_dict):
UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict)
class ChangeMinCoolingLevelAction(ThermalPolicyActionBase):
UNKNOWN_SKU_COOLING_LEVEL = 6 UNKNOWN_SKU_COOLING_LEVEL = 6
def execute(self, thermal_info_dict): def execute(self, thermal_info_dict):
from .device_data import DeviceDataManager from .device_data import DeviceDataManager
from .fan import Fan from .thermal import MAX_COOLING_LEVEL, MIN_COOLING_LEVEL_FOR_HIGH, logger
from .thermal_infos import ChassisInfo Thermal.monitor_asic_themal_zone()
from .thermal_conditions import MinCoolingLevelChangeCondition
from .thermal_conditions import UpdateCoolingLevelToMinCondition
# Calculate dynamic minimum cooling level
dynamic_min_cooling_level = None
minimum_table = DeviceDataManager.get_minimum_table() minimum_table = DeviceDataManager.get_minimum_table()
if not minimum_table: if not minimum_table:
Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL # If there is no minimum_table defined, set dynamic_min_cooling_level to default value
dynamic_min_cooling_level = ThermalRecoverAction.UNKNOWN_SKU_COOLING_LEVEL
else: else:
trust_state = MinCoolingLevelChangeCondition.trust_state trust_state = Thermal.check_module_temperature_trustable()
temperature = MinCoolingLevelChangeCondition.temperature temperature = Thermal.get_min_amb_temperature()
temperature = int(temperature / 1000)
minimum_table = minimum_table['unk_{}'.format(trust_state)] minimum_table = minimum_table['unk_{}'.format(trust_state)]
for key, cooling_level in minimum_table.items(): for key, cooling_level in minimum_table.items():
@ -185,41 +93,19 @@ class ChangeMinCoolingLevelAction(ThermalPolicyActionBase):
temp_min = int(temp_range[0].strip()) temp_min = int(temp_range[0].strip())
temp_max = int(temp_range[1].strip()) temp_max = int(temp_range[1].strip())
if temp_min <= temperature <= temp_max: if temp_min <= temperature <= temp_max:
Fan.min_cooling_level = cooling_level - 10 dynamic_min_cooling_level = cooling_level - 10
break break
current_cooling_level = Fan.get_cooling_level() if not dynamic_min_cooling_level:
if current_cooling_level < Fan.min_cooling_level: # Should not go to this branch, just in case
Fan.set_cooling_level(Fan.min_cooling_level, Fan.min_cooling_level) logger.log_error('Failed to get dynamic minimum cooling level')
SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) dynamic_min_cooling_level = MAX_COOLING_LEVEL
if Thermal.last_set_cooling_level is not None and dynamic_min_cooling_level > Thermal.last_set_cooling_level and dynamic_min_cooling_level >= MIN_COOLING_LEVEL_FOR_HIGH:
# No need to check thermal zone as dynamic_min_cooling_level is greater than previous value and MIN_COOLING_LEVEL_FOR_HIGH
Thermal.set_expect_cooling_level(dynamic_min_cooling_level)
else: else:
Fan.set_cooling_level(Fan.min_cooling_level, current_cooling_level) min_cooling_level_by_tz = Thermal.get_min_allowed_cooling_level_by_thermal_zone()
UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) if min_cooling_level_by_tz is not None:
cooling_level = max(dynamic_min_cooling_level, min_cooling_level_by_tz)
Thermal.set_expect_cooling_level(cooling_level)
class UpdatePsuFanSpeedAction(ThermalPolicyActionBase):
def execute(self, thermal_info_dict):
from .thermal_conditions import CoolingLevelChangeCondition
SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10)
class UpdateCoolingLevelToMinAction(ThermalPolicyActionBase):
def execute(self, thermal_info_dict):
self.update_cooling_level_to_minimum(thermal_info_dict)
@classmethod
def update_cooling_level_to_minimum(cls, thermal_info_dict):
from .fan import Fan
from .thermal import Thermal
from .thermal_conditions import UpdateCoolingLevelToMinCondition
from .thermal_infos import FanInfo
if Thermal.check_thermal_zone_temperature():
fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME]
speed = Fan.min_cooling_level * 10
for fan in fan_info_obj.get_presence_fans():
fan.set_speed(speed)
SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, speed)
UpdateCoolingLevelToMinCondition.enable = False
else:
UpdateCoolingLevelToMinCondition.enable = True

View File

@ -90,53 +90,3 @@ class AllPsuPresenceCondition(PsuCondition):
def is_match(self, thermal_info_dict): def is_match(self, thermal_info_dict):
psu_info_obj = self.get_psu_info(thermal_info_dict) psu_info_obj = self.get_psu_info(thermal_info_dict)
return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False
class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase):
trust_state = None
temperature = None
def is_match(self, thermal_info_dict):
from .thermal import Thermal
trust_state = Thermal.check_module_temperature_trustable()
temperature = Thermal.get_min_amb_temperature()
temperature = int(temperature / 1000)
change_cooling_level = False
if trust_state != MinCoolingLevelChangeCondition.trust_state:
MinCoolingLevelChangeCondition.trust_state = trust_state
change_cooling_level = True
if temperature != MinCoolingLevelChangeCondition.temperature:
MinCoolingLevelChangeCondition.temperature = temperature
change_cooling_level = True
return change_cooling_level
class CoolingLevelChangeCondition(ThermalPolicyConditionBase):
cooling_level = None
def is_match(self, thermal_info_dict):
from .fan import Fan
current_cooling_level = Fan.get_cooling_level()
if current_cooling_level != CoolingLevelChangeCondition.cooling_level:
CoolingLevelChangeCondition.cooling_level = current_cooling_level
return True
else:
return False
class UpdateCoolingLevelToMinCondition(ThermalPolicyConditionBase):
enable = False
def is_match(self, thermal_info_dict):
if not UpdateCoolingLevelToMinCondition.enable:
return False
from .fan import Fan
current_cooling_level = Fan.get_cooling_level()
if current_cooling_level == Fan.min_cooling_level:
UpdateCoolingLevelToMinCondition.enable = False
return False
return True

View File

@ -14,33 +14,22 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import os
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
from .thermal_actions import * from .thermal_actions import *
from .thermal_conditions import * from .thermal_conditions import *
from .thermal_infos import * from .thermal_infos import *
from .thermal import logger, MAX_COOLING_LEVEL, Thermal
class ThermalManager(ThermalManagerBase): class ThermalManager(ThermalManagerBase):
@classmethod
def initialize(cls):
"""
Initialize thermal manager, including register thermal condition types and thermal action types
and any other vendor specific initialization.
:return:
"""
cls._add_private_thermal_policy()
@classmethod
def start_thermal_control_algorithm(cls):
    """
    Start thermal control algorithm

    Returns:
        bool: True if set success, False if fail. The value is whatever
        Thermal.set_thermal_algorithm_status reports.
    """
    # Imported locally so this method does not depend on the module-level
    # import list.
    from .thermal import Thermal

    # Hand the status back to the caller; previously it was dropped and the
    # method always returned None despite the documented bool.
    return Thermal.set_thermal_algorithm_status(True)
@classmethod @classmethod
@ -49,24 +38,33 @@ class ThermalManager(ThermalManagerBase):
Stop thermal control algorithm Stop thermal control algorithm
Returns: Returns:
bool: True if set success, False if fail. bool: True if set success, False if fail.
""" """
from .thermal import Thermal
Thermal.set_thermal_algorithm_status(False) Thermal.set_thermal_algorithm_status(False)
@classmethod
def _add_private_thermal_policy(cls):
    """Register the three built-in policies in ``cls._policy_dict``.

    Each policy pairs exactly one condition with one action.
    """
    dynamic_min_speed_policy = ThermalPolicy()
    dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition()
    dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction()
    cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy

    update_psu_fan_speed_policy = ThermalPolicy()
    update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition()
    update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction()
    cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy

    update_cooling_level_policy = ThermalPolicy()
    update_cooling_level_policy.conditions[UpdateCoolingLevelToMinCondition] = UpdateCoolingLevelToMinCondition()
    update_cooling_level_policy.actions[UpdateCoolingLevelToMinAction] = UpdateCoolingLevelToMinAction()
    cls._policy_dict['UpdateCoolingLevelPolicy'] = update_cooling_level_policy

@classmethod
def run_policy(cls, chassis):
    """Collect thermal information and evaluate every loaded policy once.

    Any failure, either while collecting information or while running a
    policy, requests MAX_COOLING_LEVEL so the fans fail safe. The expected
    cooling level is committed after all policies have been evaluated.

    :param chassis: Chassis object passed to the information collectors.
    :return: None
    """
    if not cls._policy_dict:
        # No policy loaded, nothing to evaluate.
        return

    try:
        cls._collect_thermal_information(chassis)
    except Exception as e:
        logger.log_error('Failed to collect thermal information {}'.format(repr(e)))
        # Without fresh information no policy can be trusted: go to max.
        Thermal.set_expect_cooling_level(MAX_COOLING_LEVEL)
        Thermal.commit_cooling_level(cls._thermal_info_dict)
        return

    for policy in cls._policy_dict.values():
        if not cls._running:
            # The manager was stopped while policies were running.
            return
        try:
            if policy.is_match(cls._thermal_info_dict):
                policy.do_action(cls._thermal_info_dict)
        except Exception as e:
            logger.log_error('Failed to run thermal policy {} - {}'.format(policy.name, repr(e)))
            # In case there is an exception, we put cooling level to max value
            Thermal.set_expect_cooling_level(MAX_COOLING_LEVEL)

    # NOTE(review): the flattened source does not show nesting; committing
    # once after the loop is assumed -- confirm against the real file.
    Thermal.commit_cooling_level(cls._thermal_info_dict)

View File

@ -42,3 +42,14 @@ def auto_recover_mock():
utils.read_str_from_file = origin_read_str_from_file utils.read_str_from_file = origin_read_str_from_file
utils.write_file = origin_write_file utils.write_file = origin_write_file
utils.read_float_from_file = origin_read_float_from_file utils.read_float_from_file = origin_read_float_from_file
@pytest.fixture(scope='function', autouse=True)
def auto_reset_cooling_level():
    """Clear the cooling-level bookkeeping on ``Thermal`` after every test.

    The attributes live on the class, so values left behind by one test
    would otherwise be visible to the next one.
    """
    from sonic_platform.thermal import Thermal

    yield

    # Teardown: put every cached/expected value back to "unset".
    for attr_name in (
        'expect_cooling_level',
        'expect_cooling_state',
        'last_set_cooling_level',
        'last_set_cooling_state',
        'last_set_psu_cooling_level',
    ):
        setattr(Thermal, attr_name, None)

View File

@ -27,8 +27,7 @@
], ],
"actions": [ "actions": [
{ {
"type": "thermal_control.control", "type": "thermal.recover"
"status": "false"
}, },
{ {
"type": "fan.all.set_speed", "type": "fan.all.set_speed",
@ -45,8 +44,7 @@
], ],
"actions": [ "actions": [
{ {
"type": "thermal_control.control", "type": "thermal.recover"
"status": "false"
}, },
{ {
"type": "fan.all.set_speed", "type": "fan.all.set_speed",
@ -66,8 +64,7 @@
], ],
"actions": [ "actions": [
{ {
"type": "thermal_control.control", "type": "thermal.recover"
"status": "true"
} }
] ]
} }

View File

@ -18,14 +18,14 @@ import os
import pytest import pytest
import subprocess import subprocess
import sys import sys
from mock import call, MagicMock from mock import MagicMock, patch
test_path = os.path.dirname(os.path.abspath(__file__)) test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path) modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path) sys.path.insert(0, modules_path)
from sonic_platform import utils from sonic_platform import utils
from sonic_platform.fan import Fan, PsuFan, COOLING_STATE_PATH from sonic_platform.fan import Fan, PsuFan
from sonic_platform.fan_drawer import RealDrawer, VirtualDrawer from sonic_platform.fan_drawer import RealDrawer, VirtualDrawer
from sonic_platform.psu import Psu from sonic_platform.psu import Psu
@ -100,64 +100,31 @@ class TestFan:
fan.fan_drawer.get_presence = MagicMock(return_value=True) fan.fan_drawer.get_presence = MagicMock(return_value=True)
assert fan.get_presence() is True assert fan.get_presence() is True
def test_system_fan_set_speed(self): @patch('sonic_platform.utils.write_file')
def test_system_fan_set_speed(self, mock_write_file):
fan_drawer = RealDrawer(0) fan_drawer = RealDrawer(0)
fan = Fan(2, fan_drawer, 1) fan = Fan(2, fan_drawer, 1)
fan.min_cooling_level = 2
fan.set_cooling_level = MagicMock()
utils.write_file = MagicMock()
fan.set_speed(60) fan.set_speed(60)
fan.set_cooling_level.assert_called_with(6, 6) mock_write_file.assert_called_with(fan.fan_speed_set_path, 153, raise_exception=True)
utils.write_file.assert_called_with(fan.fan_speed_set_path, 153, raise_exception=True)
fan.min_cooling_level = 7 @patch('sonic_platform.thermal.Thermal.get_cooling_level')
fan.set_speed(60) @patch('sonic_platform.psu.Psu.get_presence')
fan.set_cooling_level.assert_called_with(7, 7) @patch('sonic_platform.psu.Psu.get_powergood_status')
utils.write_file.assert_called_with(fan.fan_speed_set_path, 178, raise_exception=True) @patch('os.path.exists')
def test_psu_fan_basic(self, mock_path_exists, mock_powergood, mock_presence, mock_cooling_level):
def test_set_cooling_level(self): mock_path_exists.return_value = False
with pytest.raises(RuntimeError):
Fan.set_cooling_level(11, 11)
utils.write_file = MagicMock()
Fan.set_cooling_level(10, 10)
calls = [call(COOLING_STATE_PATH, 20, raise_exception=True), call(COOLING_STATE_PATH, 10, raise_exception=True)]
utils.write_file.assert_has_calls(calls)
utils.write_file = MagicMock(side_effect=IOError(''))
with pytest.raises(RuntimeError):
Fan.set_cooling_level(10, 10)
utils.write_file = MagicMock(side_effect=ValueError(''))
with pytest.raises(RuntimeError):
Fan.set_cooling_level(10, 10)
def test_get_cooling_level(self):
utils.read_int_from_file = MagicMock()
Fan.get_cooling_level()
utils.read_int_from_file.assert_called_with(COOLING_STATE_PATH, raise_exception=True)
utils.read_int_from_file = MagicMock(side_effect=IOError(''))
with pytest.raises(RuntimeError):
Fan.get_cooling_level()
utils.read_int_from_file = MagicMock(side_effect=ValueError(''))
with pytest.raises(RuntimeError):
Fan.get_cooling_level()
def test_psu_fan_basic(self):
psu = Psu(0) psu = Psu(0)
fan = PsuFan(0, 1, psu) fan = PsuFan(0, 1, psu)
assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE
assert fan.get_status() is True assert fan.get_status() is True
assert fan.get_presence() is False assert fan.get_presence() is False
psu.get_presence = MagicMock(return_value=True) mock_presence.return_value = True
assert fan.get_presence() is False assert fan.get_presence() is False
psu.get_powergood_status = MagicMock(return_value=True) mock_powergood.return_value = True
assert fan.get_presence() is False assert fan.get_presence() is False
os.path.exists = MagicMock(return_value=True) mock_path_exists.return_value = True
assert fan.get_presence() is True assert fan.get_presence() is True
fan.get_cooling_level = MagicMock(return_value=7) mock_cooling_level.return_value = 7
assert fan.get_target_speed() == 70 assert fan.get_target_speed() == 70
def test_psu_fan_set_speed(self): def test_psu_fan_set_speed(self):

View File

@ -17,6 +17,7 @@
import glob import glob
import os import os
import pytest
import sys import sys
if sys.version_info.major == 3: if sys.version_info.major == 3:
from unittest import mock from unittest import mock
@ -32,12 +33,13 @@ from sonic_platform.device_data import DeviceDataManager
class TestThermal: class TestThermal:
@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_gearbox_count', mock.MagicMock(return_value=2))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_cpu_thermal_count', mock.MagicMock(return_value=2))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name', mock.MagicMock(return_value='x86_64-mlnx_msn2700-r0'))
def test_chassis_thermal(self): def test_chassis_thermal(self):
from sonic_platform.thermal import THERMAL_NAMING_RULE from sonic_platform.thermal import THERMAL_NAMING_RULE
os.path.exists = mock.MagicMock(return_value=True) os.path.exists = mock.MagicMock(return_value=True)
DeviceDataManager.get_gearbox_count = mock.MagicMock(return_value=2)
DeviceDataManager.get_cpu_thermal_count = mock.MagicMock(return_value=2)
DeviceDataManager.get_platform_name = mock.MagicMock(return_value='x86_64-mlnx_msn2700-r0')
chassis = Chassis() chassis = Chassis()
thermal_list = chassis.get_all_thermals() thermal_list = chassis.get_all_thermals()
assert thermal_list assert thermal_list
@ -65,7 +67,7 @@ class TestThermal:
gearbox_thermal_rule = rule gearbox_thermal_rule = rule
elif 'CPU Core' in rule['name']: elif 'CPU Core' in rule['name']:
cpu_thermal_rule = rule cpu_thermal_rule = rule
gearbox_thermal_count = 0 gearbox_thermal_count = 0
cpu_thermal_count = 0 cpu_thermal_count = 0
for thermal in thermal_list: for thermal in thermal_list:
@ -85,7 +87,7 @@ class TestThermal:
assert cpu_thermal_rule['high_threshold'].format(start_index) in thermal.high_threshold assert cpu_thermal_rule['high_threshold'].format(start_index) in thermal.high_threshold
assert cpu_thermal_rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold assert cpu_thermal_rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
cpu_thermal_count += 1 cpu_thermal_count += 1
assert gearbox_thermal_count == 2 assert gearbox_thermal_count == 2
assert cpu_thermal_count == 2 assert cpu_thermal_count == 2
@ -151,7 +153,7 @@ class TestThermal:
utils.read_float_from_file = mock.MagicMock(return_value=0.0) utils.read_float_from_file = mock.MagicMock(return_value=0.0)
assert thermal.get_temperature() is None assert thermal.get_temperature() is None
utils.read_float_from_file = mock.MagicMock(return_value=None) utils.read_float_from_file = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None assert thermal.get_temperature() is None
@ -167,7 +169,7 @@ class TestThermal:
utils.read_float_from_file = mock.MagicMock(return_value=0.0) utils.read_float_from_file = mock.MagicMock(return_value=0.0)
assert thermal.get_temperature() is None assert thermal.get_temperature() is None
utils.read_float_from_file = mock.MagicMock(return_value=None) utils.read_float_from_file = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None assert thermal.get_temperature() is None
@ -183,7 +185,7 @@ class TestThermal:
utils.read_float_from_file = mock.MagicMock(return_value=0.0) utils.read_float_from_file = mock.MagicMock(return_value=0.0)
assert thermal.get_high_critical_threshold() is None assert thermal.get_high_critical_threshold() is None
utils.read_float_from_file = mock.MagicMock(return_value=None) utils.read_float_from_file = mock.MagicMock(return_value=None)
assert thermal.get_high_critical_threshold() is None assert thermal.get_high_critical_threshold() is None
@ -197,7 +199,7 @@ class TestThermal:
for folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD): for folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_POLICY_FILE), 'step_wise') utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_POLICY_FILE), 'step_wise')
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_MODE_FILE), 'enabled') utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_MODE_FILE), 'enabled')
assert Thermal.set_thermal_algorithm_status(False, False) assert Thermal.set_thermal_algorithm_status(False, False)
for folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD): for folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_POLICY_FILE), 'user_space') utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_POLICY_FILE), 'user_space')
@ -207,27 +209,35 @@ class TestThermal:
assert Thermal.set_thermal_algorithm_status(False) assert Thermal.set_thermal_algorithm_status(False)
def test_check_thermal_zone_temperature(self):
    """Check Thermal.check_thermal_zone_temperature against mocked zones."""
    from sonic_platform.thermal import Thermal, THERMAL_ZONE_FOLDER_WILDCARD, THERMAL_ZONE_THRESHOLD_FILE, THERMAL_ZONE_TEMP_FILE
    from sonic_platform import utils
    glob.iglob = mock.MagicMock(return_value=['thermal_zone1', 'thermal_zone2'])

    # A read failure must be reported as "not OK".
    utils.read_int_from_file = mock.MagicMock(side_effect=Exception(''))
    assert not Thermal.check_thermal_zone_temperature()

    mock_file_content = {}
    def mock_read_int_from_file(file_path, default=0, raise_exception=False):
        return mock_file_content[file_path]

    utils.read_int_from_file = mock_read_int_from_file
    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_THRESHOLD_FILE)] = 25
    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 30
    mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_THRESHOLD_FILE)] = 25
    mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_TEMP_FILE)] = 24
    assert not Thermal.check_thermal_zone_temperature()

    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 24
    assert Thermal.check_thermal_zone_temperature()

@mock.patch('glob.iglob', mock.MagicMock(return_value=['thermal_zone1', 'thermal_zone2']))
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_min_allowed_cooling_level_by_thermal_zone(self, mock_read_file):
    """Check the minimum allowed cooling level derived from two mocked zones.

    Zone 2 stays cold throughout; only the temperature of zone 1 is varied.
    """
    from sonic_platform.thermal import (
        Thermal,
        THERMAL_ZONE_TEMP_FILE,
        THERMAL_ZONE_HIGH_THRESHOLD,
        THERMAL_ZONE_NORMAL_THRESHOLD,
        MIN_COOLING_LEVEL_FOR_HIGH,
        MIN_COOLING_LEVEL_FOR_NORMAL,
    )

    # A read failure yields "no minimum".
    mock_read_file.side_effect = Exception('')
    assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() is None

    mock_file_content = {}
    def mock_read_int_from_file(file_path, default=0, raise_exception=False):
        return mock_file_content[file_path]

    mock_read_file.side_effect = mock_read_int_from_file
    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_NORMAL_THRESHOLD)] = 75000
    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_HIGH_THRESHOLD)] = 85000
    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 69000
    mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_NORMAL_THRESHOLD)] = 75000
    # Give zone 2 its own high threshold (zone 1's entry used to be written
    # twice here, leaving zone 2 without one).
    mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_HIGH_THRESHOLD)] = 85000
    mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_TEMP_FILE)] = 24000
    assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() == MIN_COOLING_LEVEL_FOR_NORMAL

    # NOTE(review): the expected transitions happen below the raw 75000 /
    # 85000 thresholds, presumably due to a hysteresis margin in the
    # implementation -- confirm.
    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 71000
    assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() == MIN_COOLING_LEVEL_FOR_HIGH

    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 79000
    assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() == MIN_COOLING_LEVEL_FOR_HIGH

    mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 81000
    assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() is None
def test_check_module_temperature_trustable(self): def test_check_module_temperature_trustable(self):
from sonic_platform.thermal import Thermal from sonic_platform.thermal import Thermal
@ -255,3 +265,47 @@ class TestThermal:
mock_file_content[os.path.join(CHASSIS_THERMAL_SYSFS_FOLDER, 'fan_amb')] = 50 mock_file_content[os.path.join(CHASSIS_THERMAL_SYSFS_FOLDER, 'fan_amb')] = 50
mock_file_content[os.path.join(CHASSIS_THERMAL_SYSFS_FOLDER, 'port_amb')] = 40 mock_file_content[os.path.join(CHASSIS_THERMAL_SYSFS_FOLDER, 'port_amb')] = 40
assert Thermal.get_min_amb_temperature() == 40 assert Thermal.get_min_amb_temperature() == 40
@mock.patch('sonic_platform.utils.write_file')
def test_set_cooling_level(self, mock_write_file):
    """Check what Thermal.set_cooling_level writes to COOLING_STATE_PATH.

    The test expects 20 to be written for level 10 and 19 for level 9, and
    no additional write when the same level is requested twice in a row.
    """
    from sonic_platform.thermal import Thermal, COOLING_STATE_PATH

    # First request goes to the file.
    Thermal.set_cooling_level(10)
    mock_write_file.assert_has_calls([mock.call(COOLING_STATE_PATH, 20, raise_exception=True)])

    # Same level again: the number of writes must not grow.
    writes_so_far = mock_write_file.call_count
    Thermal.set_cooling_level(10)
    assert writes_so_far == mock_write_file.call_count

    # A different level is written again.
    Thermal.set_cooling_level(9)
    mock_write_file.assert_has_calls([mock.call(COOLING_STATE_PATH, 19, raise_exception=True)])
@mock.patch('sonic_platform.utils.write_file')
def test_set_cooling_state(self, mock_write_file):
    """Check what Thermal.set_cooling_state writes to COOLING_STATE_PATH.

    The test expects the requested value to be written unchanged (10, then
    9) and no additional write when the same state is requested twice.
    """
    from sonic_platform.thermal import Thermal, COOLING_STATE_PATH

    # First request goes to the file.
    Thermal.set_cooling_state(10)
    mock_write_file.assert_has_calls([mock.call(COOLING_STATE_PATH, 10, raise_exception=True)])

    # Same state again: the number of writes must not grow.
    writes_so_far = mock_write_file.call_count
    Thermal.set_cooling_state(10)
    assert writes_so_far == mock_write_file.call_count

    # A different state is written again.
    Thermal.set_cooling_state(9)
    mock_write_file.assert_has_calls([mock.call(COOLING_STATE_PATH, 9, raise_exception=True)])
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_cooling_level(self, mock_read_file):
    """Check that Thermal.get_cooling_level reads COOLING_STATE_PATH and
    turns low-level read errors into RuntimeError."""
    from sonic_platform.thermal import Thermal, COOLING_STATE_PATH

    Thermal.get_cooling_level()
    mock_read_file.assert_called_with(COOLING_STATE_PATH, raise_exception=True)

    # Both I/O and parse failures must surface as RuntimeError.
    for read_error in (IOError(''), ValueError('')):
        mock_read_file.side_effect = read_error
        with pytest.raises(RuntimeError):
            Thermal.get_cooling_level()

View File

@ -18,7 +18,7 @@ import os
import sys import sys
import pytest import pytest
import json import json
from mock import MagicMock from mock import MagicMock, patch
from .mock_platform import MockChassis, MockFan, MockFanDrawer, MockPsu from .mock_platform import MockChassis, MockFan, MockFanDrawer, MockPsu
test_path = os.path.dirname(os.path.abspath(__file__)) test_path = os.path.dirname(os.path.abspath(__file__))
@ -27,24 +27,10 @@ sys.path.insert(0, modules_path)
from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_manager import ThermalManager
from sonic_platform.thermal_infos import FanInfo, PsuInfo from sonic_platform.thermal_infos import FanInfo, PsuInfo
from sonic_platform.fan import Fan from sonic_platform.thermal import Thermal, MAX_COOLING_LEVEL
from sonic_platform.thermal import Thermal
from sonic_platform.device_data import DeviceDataManager from sonic_platform.device_data import DeviceDataManager
@pytest.fixture(scope='module', autouse=True)
def configure_mocks():
check_thermal_zone_temperature = Thermal.check_thermal_zone_temperature
set_thermal_algorithm_status = Thermal.set_thermal_algorithm_status
Thermal.check_thermal_zone_temperature = MagicMock()
Thermal.set_thermal_algorithm_status = MagicMock()
yield
Thermal.check_thermal_zone_temperature = check_thermal_zone_temperature
Thermal.set_thermal_algorithm_status = set_thermal_algorithm_status
@pytest.fixture(scope='session', autouse=True) @pytest.fixture(scope='session', autouse=True)
def thermal_manager(): def thermal_manager():
policy_file = os.path.join(test_path, 'thermal_policy.json') policy_file = os.path.join(test_path, 'thermal_policy.json')
@ -113,51 +99,60 @@ def test_psu_info():
assert not psu_info.is_status_changed() assert not psu_info.is_status_changed()
def test_fan_policy(thermal_manager): @patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock())
@patch('sonic_platform.thermal.Thermal.get_cooling_level', MagicMock(return_value=6))
@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone', MagicMock(return_value=2))
@patch('sonic_platform.thermal.Thermal.set_cooling_state')
@patch('sonic_platform.thermal.Thermal.set_cooling_level')
def test_fan_policy(mock_set_cooling_level, mock_set_cooling_state, thermal_manager):
print('In test_fan_policy')
from sonic_platform.thermal import MIN_COOLING_LEVEL_FOR_NORMAL
chassis = MockChassis() chassis = MockChassis()
chassis.make_fan_absence() chassis.make_fan_absence()
chassis.get_all_fan_drawers()[0].get_all_fans().append(MockFan()) chassis.get_all_fan_drawers()[0].get_all_fans().append(MockFan())
chassis.platform_name = 'some_platform'
thermal_manager.run_policy(chassis) thermal_manager.run_policy(chassis)
mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL)
mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL)
Thermal.expect_cooling_level = None
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans() fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
assert fan_list[1].speed == 100
Thermal.set_thermal_algorithm_status.assert_called_with(False, False)
fan_list[0].presence = True fan_list[0].presence = True
Thermal.check_thermal_zone_temperature = MagicMock(return_value=True)
thermal_manager.run_policy(chassis) thermal_manager.run_policy(chassis)
Thermal.set_thermal_algorithm_status.assert_called_with(True, False) mock_set_cooling_level.assert_called_with(6)
assert Thermal.check_thermal_zone_temperature.call_count == 2 mock_set_cooling_state.assert_called_with(6)
assert fan_list[0].speed == 60
assert fan_list[1].speed == 60
Thermal.expect_cooling_level = None
fan_list[0].status = False fan_list[0].status = False
thermal_manager.run_policy(chassis) thermal_manager.run_policy(chassis)
Thermal.set_thermal_algorithm_status.assert_called_with(False, False) mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL)
Thermal.expect_cooling_level = None
fan_list[0].status = True fan_list[0].status = True
Thermal.check_thermal_zone_temperature = MagicMock(return_value=False)
thermal_manager.run_policy(chassis) thermal_manager.run_policy(chassis)
Thermal.set_thermal_algorithm_status.assert_called_with(True, False) mock_set_cooling_level.assert_called_with(6)
assert Thermal.check_thermal_zone_temperature.call_count == 2 mock_set_cooling_state.assert_called_with(6)
assert fan_list[0].speed == 100
assert fan_list[1].speed == 100
def test_psu_policy(thermal_manager): @patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock())
@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone', MagicMock(return_value=2))
@patch('sonic_platform.thermal.Thermal.get_cooling_level', MagicMock(return_value=6))
@patch('sonic_platform.thermal.Thermal.set_cooling_state')
@patch('sonic_platform.thermal.Thermal.set_cooling_level')
def test_psu_policy(mock_set_cooling_level, mock_set_cooling_state, thermal_manager):
chassis = MockChassis() chassis = MockChassis()
chassis.make_psu_absence() chassis.make_psu_absence()
chassis.fan_list.append(MockFan()) chassis.platform_name = 'some_platform'
thermal_manager.run_policy(chassis) thermal_manager.run_policy(chassis)
mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL)
fan_list = chassis.get_all_fans() mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL)
assert fan_list[0].speed == 100
Thermal.set_thermal_algorithm_status.assert_called_with(False, False)
psu_list = chassis.get_all_psus() psu_list = chassis.get_all_psus()
psu_list[0].presence = True psu_list[0].presence = True
thermal_manager.run_policy(chassis) thermal_manager.run_policy(chassis)
Thermal.set_thermal_algorithm_status.assert_called_with(True, False) mock_set_cooling_level.assert_called_with(6)
mock_set_cooling_state.assert_called_with(6)
def test_any_fan_absence_condition(): def test_any_fan_absence_condition():
@ -328,6 +323,7 @@ def test_load_set_fan_speed_action():
action.load_from_json(json_obj) action.load_from_json(json_obj)
@patch('sonic_platform.thermal.Thermal.set_cooling_level', MagicMock())
def test_execute_set_fan_speed_action(): def test_execute_set_fan_speed_action():
chassis = MockChassis() chassis = MockChassis()
chassis.get_all_fan_drawers().append(MockFanDrawer()) chassis.get_all_fan_drawers().append(MockFanDrawer())
@ -337,85 +333,14 @@ def test_execute_set_fan_speed_action():
fan_info = FanInfo() fan_info = FanInfo()
fan_info.collect(chassis) fan_info.collect(chassis)
Thermal.expect_cooling_level = None
from sonic_platform.thermal_actions import SetAllFanSpeedAction from sonic_platform.thermal_actions import SetAllFanSpeedAction
action = SetAllFanSpeedAction() action = SetAllFanSpeedAction()
action.speed = 99 action.speed = 20
action.execute({'fan_info': fan_info}) action.execute({'fan_info': fan_info})
assert fan_list[0].speed == 99 assert Thermal.expect_cooling_level == 2
assert fan_list[1].speed == 99
def test_load_control_thermal_algo_action():
from sonic_platform.thermal_actions import ControlThermalAlgoAction
action = ControlThermalAlgoAction()
json_str = '{\"status\": \"false\"}'
json_obj = json.loads(json_str)
action.load_from_json(json_obj)
assert not action.status
json_str = '{\"status\": \"true\"}'
json_obj = json.loads(json_str)
action.load_from_json(json_obj)
assert action.status
json_str = '{\"status\": \"invalid\"}'
json_obj = json.loads(json_str)
with pytest.raises(ValueError):
action.load_from_json(json_obj)
json_str = '{\"invalid\": \"true\"}'
json_obj = json.loads(json_str)
with pytest.raises(ValueError):
action.load_from_json(json_obj)
def test_load_check_and_set_speed_action():
from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction
action = CheckAndSetAllFanSpeedAction()
json_str = '{\"speed\": \"40\"}'
json_obj = json.loads(json_str)
action.load_from_json(json_obj)
assert action.speed == 40
json_str = '{\"speed\": \"-1\"}'
json_obj = json.loads(json_str)
with pytest.raises(ValueError):
action.load_from_json(json_obj)
json_str = '{\"speed\": \"101\"}'
json_obj = json.loads(json_str)
with pytest.raises(ValueError):
action.load_from_json(json_obj)
json_str = '{\"invalid\": \"60\"}'
json_obj = json.loads(json_str)
with pytest.raises(ValueError):
action.load_from_json(json_obj)
def test_execute_check_and_set_fan_speed_action():
chassis = MockChassis()
chassis.get_all_fan_drawers().append(MockFanDrawer())
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
fan_list.append(MockFan())
fan_list.append(MockFan())
fan_info = FanInfo()
fan_info.collect(chassis)
Thermal.check_thermal_zone_temperature = MagicMock(return_value=True)
from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction
action = CheckAndSetAllFanSpeedAction()
action.speed = 99
action.execute({'fan_info': fan_info})
assert fan_list[0].speed == 99
assert fan_list[1].speed == 99
Thermal.check_thermal_zone_temperature = MagicMock(return_value=False)
fan_list[0].speed = 100
fan_list[1].speed = 100
action.speed = 60
action.execute({'fan_info': fan_info})
assert fan_list[0].speed == 100
assert fan_list[1].speed == 100
def test_load_duplicate_condition(): def test_load_duplicate_condition():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'duplicate_condition.json')) as f: with open(os.path.join(test_path, 'duplicate_condition.json')) as f:
@ -497,54 +422,89 @@ def check_minimum_table_data(platform, minimum_table):
assert cooling_level > previous_cooling_level assert cooling_level > previous_cooling_level
previous_cooling_level = cooling_level previous_cooling_level = cooling_level
def test_dynamic_minimum_policy(thermal_manager): @patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock())
from sonic_platform.thermal_conditions import MinCoolingLevelChangeCondition @patch('sonic_platform.device_data.DeviceDataManager.get_platform_name')
from sonic_platform.thermal_actions import ChangeMinCoolingLevelAction @patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone')
from sonic_platform.thermal_infos import ChassisInfo, FanInfo @patch('sonic_platform.thermal.Thermal.get_min_amb_temperature')
from sonic_platform.thermal import Thermal @patch('sonic_platform.thermal.Thermal.check_module_temperature_trustable')
from sonic_platform.fan import Fan def test_thermal_recover_policy(mock_check_trustable, mock_get_min_amb, moc_get_min_allowed, mock_platform_name):
ThermalManager.initialize() from sonic_platform.thermal_infos import ChassisInfo
assert 'DynamicMinCoolingLevelPolicy' in thermal_manager._policy_dict from sonic_platform.thermal_actions import ThermalRecoverAction
policy = thermal_manager._policy_dict['DynamicMinCoolingLevelPolicy']
assert MinCoolingLevelChangeCondition in policy.conditions
assert ChangeMinCoolingLevelAction in policy.actions
condition = policy.conditions[MinCoolingLevelChangeCondition]
action = policy.actions[ChangeMinCoolingLevelAction]
Thermal.check_module_temperature_trustable = MagicMock(return_value='trust')
Thermal.get_min_amb_temperature = MagicMock(return_value=35001)
assert condition.is_match(None)
assert MinCoolingLevelChangeCondition.trust_state == 'trust'
assert MinCoolingLevelChangeCondition.temperature == 35
assert not condition.is_match(None)
Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust')
assert condition.is_match(None)
assert MinCoolingLevelChangeCondition.trust_state == 'untrust'
Thermal.get_min_amb_temperature = MagicMock(return_value=25999)
assert condition.is_match(None)
assert MinCoolingLevelChangeCondition.temperature == 25
chassis = MockChassis() chassis = MockChassis()
mock_platform_name.return_value = 'invalid'
info = ChassisInfo() info = ChassisInfo()
info._chassis = chassis info._chassis = chassis
fan_info = FanInfo() thermal_info_dict = {ChassisInfo.INFO_NAME: info}
thermal_info_dict = { Thermal.expect_cooling_level = None
ChassisInfo.INFO_NAME: info, action = ThermalRecoverAction()
FanInfo.INFO_NAME: fan_info moc_get_min_allowed.return_value = 2
}
DeviceDataManager.get_platform_name = MagicMock(return_value=None)
Fan.get_cooling_level = MagicMock(return_value=5)
Fan.set_cooling_level = MagicMock()
action.execute(thermal_info_dict) action.execute(thermal_info_dict)
assert Fan.min_cooling_level == 6 assert Thermal.expect_cooling_level == 6
Fan.set_cooling_level.assert_called_with(6, 6) Thermal.last_set_cooling_level = Thermal.expect_cooling_level
Fan.set_cooling_level.call_count = 0
DeviceDataManager.get_platform_name = MagicMock(return_value='x86_64-mlnx_msn2700-r0') Thermal.expect_cooling_level = None
print('Before execute') mock_platform_name.return_value = 'x86_64-mlnx_msn2700-r0'
mock_check_trustable.return_value = 'trust'
mock_get_min_amb.return_value = 29999
moc_get_min_allowed.return_value = None
action.execute(thermal_info_dict) action.execute(thermal_info_dict)
assert Fan.min_cooling_level == 3 assert Thermal.expect_cooling_level is None
Fan.set_cooling_level.assert_called_with(3, 5)
moc_get_min_allowed.return_value = 4
action.execute(thermal_info_dict)
assert Thermal.expect_cooling_level == 4
Thermal.last_set_cooling_level = Thermal.expect_cooling_level
mock_check_trustable.return_value = 'untrust'
mock_get_min_amb.return_value = 31001
action.execute(thermal_info_dict)
assert Thermal.expect_cooling_level == 5
@patch('sonic_platform.thermal.Thermal.set_cooling_state')
@patch('sonic_platform.utils.read_int_from_file')
def test_monitor_asic_themal_zone(mock_read_int, mock_set_cooling_state):
    """Check Thermal.monitor_asic_themal_zone for a hot and a cool ASIC.

    ``read_int_from_file`` is fed two values per run; presumably the first
    is the ASIC temperature and the second its threshold -- TODO confirm.
    """
    # First value above the second: max cooling state is expected and
    # must be pushed out by commit_cooling_level.
    mock_read_int.side_effect = [111000, 105000]
    Thermal.monitor_asic_themal_zone()
    assert Thermal.expect_cooling_state == MAX_COOLING_LEVEL
    Thermal.commit_cooling_level({})
    mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL)

    # reset_mock() is the Mock API that clears recorded calls; a plain
    # ``reset()`` would only create and call an unrelated child mock.
    mock_read_int.reset_mock()
    # First value below the second: no cooling state is expected.
    mock_read_int.side_effect = [104000, 105000]
    Thermal.monitor_asic_themal_zone()
    assert Thermal.expect_cooling_state is None
def test_set_expect_cooling_level():
    """Check that Thermal.set_expect_cooling_level keeps the highest value
    requested so far (a lower request does not replace a higher one)."""
    # (requested level, level expected to be stored afterwards)
    steps = ((5, 5), (3, 5), (10, 10))
    for requested, stored in steps:
        Thermal.set_expect_cooling_level(requested)
        assert Thermal.expect_cooling_level == stored
@patch('sonic_platform.thermal.Thermal.commit_cooling_level', MagicMock())
@patch('sonic_platform.thermal_conditions.AnyFanFaultCondition.is_match')
@patch('sonic_platform.thermal_manager.ThermalManager._collect_thermal_information')
@patch('sonic_platform.thermal.Thermal.set_expect_cooling_level')
def test_run_policy(mock_expect, mock_collect_info, mock_match, thermal_manager):
    """Check the fail-safe paths of ThermalManager.run_policy.

    Covers: a failure while collecting information, a failure inside a
    policy condition, and a stopped manager.
    """
    mock_chassis = MockChassis()

    # 1. Collecting information fails -> max cooling level is requested.
    mock_collect_info.side_effect = Exception('')
    thermal_manager.run_policy(mock_chassis)
    mock_expect.assert_called_with(MAX_COOLING_LEVEL)

    # 2. Collection works but a condition raises -> max level again.
    mock_expect.reset_mock()
    mock_collect_info.side_effect = None
    mock_match.side_effect = Exception('')
    thermal_manager.run_policy(mock_chassis)
    mock_expect.assert_called_with(MAX_COOLING_LEVEL)

    # 3. Stopped manager: no cooling level may be requested at all.
    # NOTE(review): stop() is not undone here; if the thermal_manager
    # fixture is shared with later tests they will see a stopped manager --
    # confirm.
    thermal_manager.stop()
    mock_expect.reset_mock()
    thermal_manager.run_policy(mock_chassis)
    mock_expect.assert_not_called()

View File

@ -23,10 +23,6 @@
} }
], ],
"actions": [ "actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{ {
"type": "fan.all.set_speed", "type": "fan.all.set_speed",
"speed": "100" "speed": "100"
@ -41,10 +37,6 @@
} }
], ],
"actions": [ "actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{ {
"type": "fan.all.set_speed", "type": "fan.all.set_speed",
"speed": "100" "speed": "100"
@ -59,10 +51,6 @@
} }
], ],
"actions": [ "actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{ {
"type": "fan.all.set_speed", "type": "fan.all.set_speed",
"speed": "100" "speed": "100"
@ -84,12 +72,8 @@
], ],
"actions": [ "actions": [
{ {
"type": "thermal_control.control", "type": "thermal.recover",
"status": "true" "status": "true"
},
{
"type": "fan.all.check_and_set_speed",
"speed": "60"
} }
] ]
} }