4ae504a813
- Why I did it Optimize thermal control policies to simplify the logic and add more protection code in policies to make sure it works even if kernel algorithm does not work. - How I did it Reduce unused thermal policies Add timely ASIC temperature check in thermal policy to make sure ASIC temperature and fan speed is coordinated Minimum allowed fan speed now is calculated by max of the expected fan speed among all policies Move some logic from fan.py to thermal.py to make it more readable - How to verify it 1. Manual test 2. Regression
511 lines
18 KiB
Python
511 lines
18 KiB
Python
#
|
|
# Copyright (c) 2020-2021 NVIDIA CORPORATION & AFFILIATES.
|
|
# Apache-2.0
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import os
|
|
import sys
|
|
import pytest
|
|
import json
|
|
from mock import MagicMock, patch
|
|
from .mock_platform import MockChassis, MockFan, MockFanDrawer, MockPsu
|
|
|
|
test_path = os.path.dirname(os.path.abspath(__file__))
|
|
modules_path = os.path.dirname(test_path)
|
|
sys.path.insert(0, modules_path)
|
|
|
|
from sonic_platform.thermal_manager import ThermalManager
|
|
from sonic_platform.thermal_infos import FanInfo, PsuInfo
|
|
from sonic_platform.thermal import Thermal, MAX_COOLING_LEVEL
|
|
from sonic_platform.device_data import DeviceDataManager
|
|
|
|
|
|
@pytest.fixture(scope='session', autouse=True)
|
|
def thermal_manager():
|
|
policy_file = os.path.join(test_path, 'thermal_policy.json')
|
|
ThermalManager.load(policy_file)
|
|
return ThermalManager
|
|
|
|
|
|
def test_load_policy(thermal_manager):
|
|
assert 'psu_info' in thermal_manager._thermal_info_dict
|
|
assert 'fan_info' in thermal_manager._thermal_info_dict
|
|
assert 'chassis_info' in thermal_manager._thermal_info_dict
|
|
|
|
assert 'any fan absence' in thermal_manager._policy_dict
|
|
assert 'any psu absence' in thermal_manager._policy_dict
|
|
assert 'any fan broken' in thermal_manager._policy_dict
|
|
assert 'all fan and psu presence' in thermal_manager._policy_dict
|
|
|
|
assert thermal_manager._fan_speed_when_suspend == 60
|
|
assert thermal_manager._run_thermal_algorithm_at_boot_up == False
|
|
|
|
|
|
def test_fan_info():
|
|
chassis = MockChassis()
|
|
chassis.make_fan_absence()
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
assert len(fan_info.get_absence_fans()) == 1
|
|
assert len(fan_info.get_presence_fans()) == 0
|
|
assert len(fan_info.get_fault_fans()) == 0
|
|
assert fan_info.is_status_changed()
|
|
|
|
chassis.get_all_fan_drawers()[0].get_all_fans()[0].presence = True
|
|
fan_info.collect(chassis)
|
|
assert len(fan_info.get_absence_fans()) == 0
|
|
assert len(fan_info.get_presence_fans()) == 1
|
|
assert len(fan_info.get_fault_fans()) == 0
|
|
assert fan_info.is_status_changed()
|
|
|
|
chassis.get_all_fan_drawers()[0].get_all_fans()[0].status = False
|
|
fan_info.collect(chassis)
|
|
assert len(fan_info.get_absence_fans()) == 0
|
|
assert len(fan_info.get_presence_fans()) == 1
|
|
assert len(fan_info.get_fault_fans()) == 1
|
|
assert fan_info.is_status_changed()
|
|
|
|
def test_psu_info():
|
|
chassis = MockChassis()
|
|
chassis.make_psu_absence()
|
|
psu_info = PsuInfo()
|
|
psu_info.collect(chassis)
|
|
assert len(psu_info.get_absence_psus()) == 1
|
|
assert len(psu_info.get_presence_psus()) == 0
|
|
assert psu_info.is_status_changed()
|
|
|
|
psu_list = chassis.get_all_psus()
|
|
psu_list[0].presence = True
|
|
psu_info.collect(chassis)
|
|
assert len(psu_info.get_absence_psus()) == 0
|
|
assert len(psu_info.get_presence_psus()) == 1
|
|
assert psu_info.is_status_changed()
|
|
|
|
psu_list[0].powergood = False
|
|
psu_info.collect(chassis)
|
|
assert len(psu_info.get_absence_psus()) == 0
|
|
assert len(psu_info.get_presence_psus()) == 1
|
|
assert not psu_info.is_status_changed()
|
|
|
|
|
|
@patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock())
|
|
@patch('sonic_platform.thermal.Thermal.get_cooling_level', MagicMock(return_value=6))
|
|
@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone', MagicMock(return_value=2))
|
|
@patch('sonic_platform.thermal.Thermal.set_cooling_state')
|
|
@patch('sonic_platform.thermal.Thermal.set_cooling_level')
|
|
def test_fan_policy(mock_set_cooling_level, mock_set_cooling_state, thermal_manager):
|
|
print('In test_fan_policy')
|
|
from sonic_platform.thermal import MIN_COOLING_LEVEL_FOR_NORMAL
|
|
chassis = MockChassis()
|
|
chassis.make_fan_absence()
|
|
chassis.get_all_fan_drawers()[0].get_all_fans().append(MockFan())
|
|
chassis.platform_name = 'some_platform'
|
|
thermal_manager.run_policy(chassis)
|
|
|
|
mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL)
|
|
mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL)
|
|
|
|
Thermal.expect_cooling_level = None
|
|
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
|
|
fan_list[0].presence = True
|
|
thermal_manager.run_policy(chassis)
|
|
mock_set_cooling_level.assert_called_with(6)
|
|
mock_set_cooling_state.assert_called_with(6)
|
|
|
|
Thermal.expect_cooling_level = None
|
|
fan_list[0].status = False
|
|
thermal_manager.run_policy(chassis)
|
|
mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL)
|
|
|
|
Thermal.expect_cooling_level = None
|
|
fan_list[0].status = True
|
|
thermal_manager.run_policy(chassis)
|
|
mock_set_cooling_level.assert_called_with(6)
|
|
mock_set_cooling_state.assert_called_with(6)
|
|
|
|
|
|
@patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock())
|
|
@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone', MagicMock(return_value=2))
|
|
@patch('sonic_platform.thermal.Thermal.get_cooling_level', MagicMock(return_value=6))
|
|
@patch('sonic_platform.thermal.Thermal.set_cooling_state')
|
|
@patch('sonic_platform.thermal.Thermal.set_cooling_level')
|
|
def test_psu_policy(mock_set_cooling_level, mock_set_cooling_state, thermal_manager):
|
|
chassis = MockChassis()
|
|
chassis.make_psu_absence()
|
|
chassis.platform_name = 'some_platform'
|
|
thermal_manager.run_policy(chassis)
|
|
mock_set_cooling_level.assert_called_with(MAX_COOLING_LEVEL)
|
|
mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL)
|
|
|
|
psu_list = chassis.get_all_psus()
|
|
psu_list[0].presence = True
|
|
thermal_manager.run_policy(chassis)
|
|
mock_set_cooling_level.assert_called_with(6)
|
|
mock_set_cooling_state.assert_called_with(6)
|
|
|
|
|
|
def test_any_fan_absence_condition():
|
|
chassis = MockChassis()
|
|
chassis.make_fan_absence()
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AnyFanAbsenceCondition
|
|
condition = AnyFanAbsenceCondition()
|
|
assert condition.is_match({'fan_info': fan_info})
|
|
|
|
fan = chassis.get_all_fan_drawers()[0].get_all_fans()[0]
|
|
fan.presence = True
|
|
fan_info.collect(chassis)
|
|
assert not condition.is_match({'fan_info': fan_info})
|
|
|
|
|
|
def test_all_fan_absence_condition():
|
|
chassis = MockChassis()
|
|
chassis.make_fan_absence()
|
|
fan = MockFan()
|
|
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
|
|
fan_list.append(fan)
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AllFanAbsenceCondition
|
|
condition = AllFanAbsenceCondition()
|
|
assert not condition.is_match({'fan_info': fan_info})
|
|
|
|
fan.presence = False
|
|
fan_info.collect(chassis)
|
|
assert condition.is_match({'fan_info': fan_info})
|
|
|
|
|
|
def test_all_fan_presence_condition():
|
|
chassis = MockChassis()
|
|
chassis.make_fan_absence()
|
|
fan = MockFan()
|
|
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
|
|
fan_list.append(fan)
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AllFanPresenceCondition
|
|
condition = AllFanPresenceCondition()
|
|
assert not condition.is_match({'fan_info': fan_info})
|
|
|
|
fan_list[0].presence = True
|
|
fan_info.collect(chassis)
|
|
assert condition.is_match({'fan_info': fan_info})
|
|
|
|
def test_any_fan_fault_condition():
|
|
chassis = MockChassis()
|
|
chassis.get_all_fan_drawers().append(MockFanDrawer())
|
|
fan = MockFan()
|
|
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
|
|
fan_list.append(fan)
|
|
fault_fan = MockFan()
|
|
fault_fan.status = False
|
|
fan_list.append(fault_fan)
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AnyFanFaultCondition
|
|
condition = AnyFanFaultCondition()
|
|
assert condition.is_match({'fan_info': fan_info})
|
|
|
|
fault_fan.status = True
|
|
fan_info.collect(chassis)
|
|
assert not condition.is_match({'fan_info': fan_info})
|
|
|
|
def test_all_fan_good_condition():
|
|
chassis = MockChassis()
|
|
chassis.get_all_fan_drawers().append(MockFanDrawer())
|
|
fan = MockFan()
|
|
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
|
|
fan_list.append(fan)
|
|
fault_fan = MockFan()
|
|
fault_fan.status = False
|
|
fan_list.append(fault_fan)
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AllFanGoodCondition
|
|
condition = AllFanGoodCondition()
|
|
assert not condition.is_match({'fan_info': fan_info})
|
|
|
|
fault_fan.status = True
|
|
fan_info.collect(chassis)
|
|
assert condition.is_match({'fan_info': fan_info})
|
|
|
|
|
|
def test_any_psu_absence_condition():
|
|
chassis = MockChassis()
|
|
chassis.make_psu_absence()
|
|
psu_info = PsuInfo()
|
|
psu_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AnyPsuAbsenceCondition
|
|
condition = AnyPsuAbsenceCondition()
|
|
assert condition.is_match({'psu_info': psu_info})
|
|
|
|
psu = chassis.get_all_psus()[0]
|
|
psu.presence = True
|
|
psu_info.collect(chassis)
|
|
assert not condition.is_match({'psu_info': psu_info})
|
|
|
|
|
|
def test_all_psu_absence_condition():
|
|
chassis = MockChassis()
|
|
chassis.make_psu_absence()
|
|
psu = MockPsu()
|
|
psu_list = chassis.get_all_psus()
|
|
psu_list.append(psu)
|
|
psu_info = PsuInfo()
|
|
psu_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AllPsuAbsenceCondition
|
|
condition = AllPsuAbsenceCondition()
|
|
assert not condition.is_match({'psu_info': psu_info})
|
|
|
|
psu.presence = False
|
|
psu_info.collect(chassis)
|
|
assert condition.is_match({'psu_info': psu_info})
|
|
|
|
|
|
def test_all_fan_presence_condition():
|
|
chassis = MockChassis()
|
|
chassis.make_psu_absence()
|
|
psu = MockPsu()
|
|
psu_list = chassis.get_all_psus()
|
|
psu_list.append(psu)
|
|
psu_info = PsuInfo()
|
|
psu_info.collect(chassis)
|
|
|
|
from sonic_platform.thermal_conditions import AllPsuPresenceCondition
|
|
condition = AllPsuPresenceCondition()
|
|
assert not condition.is_match({'psu_info': psu_info})
|
|
|
|
psu_list[0].presence = True
|
|
psu_info.collect(chassis)
|
|
assert condition.is_match({'psu_info': psu_info})
|
|
|
|
|
|
def test_load_set_fan_speed_action():
|
|
from sonic_platform.thermal_actions import SetAllFanSpeedAction
|
|
action = SetAllFanSpeedAction()
|
|
json_str = '{\"speed\": \"50\"}'
|
|
json_obj = json.loads(json_str)
|
|
action.load_from_json(json_obj)
|
|
assert action.speed == 50
|
|
|
|
json_str = '{\"speed\": \"-1\"}'
|
|
json_obj = json.loads(json_str)
|
|
with pytest.raises(ValueError):
|
|
action.load_from_json(json_obj)
|
|
|
|
json_str = '{\"speed\": \"101\"}'
|
|
json_obj = json.loads(json_str)
|
|
with pytest.raises(ValueError):
|
|
action.load_from_json(json_obj)
|
|
|
|
json_str = '{\"invalid\": \"101\"}'
|
|
json_obj = json.loads(json_str)
|
|
with pytest.raises(ValueError):
|
|
action.load_from_json(json_obj)
|
|
|
|
|
|
@patch('sonic_platform.thermal.Thermal.set_cooling_level', MagicMock())
|
|
def test_execute_set_fan_speed_action():
|
|
chassis = MockChassis()
|
|
chassis.get_all_fan_drawers().append(MockFanDrawer())
|
|
fan_list = chassis.get_all_fan_drawers()[0].get_all_fans()
|
|
fan_list.append(MockFan())
|
|
fan_list.append(MockFan())
|
|
fan_info = FanInfo()
|
|
fan_info.collect(chassis)
|
|
|
|
Thermal.expect_cooling_level = None
|
|
from sonic_platform.thermal_actions import SetAllFanSpeedAction
|
|
action = SetAllFanSpeedAction()
|
|
action.speed = 20
|
|
action.execute({'fan_info': fan_info})
|
|
assert Thermal.expect_cooling_level == 2
|
|
|
|
|
|
def test_load_duplicate_condition():
|
|
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
|
with open(os.path.join(test_path, 'duplicate_condition.json')) as f:
|
|
json_obj = json.load(f)
|
|
policy = ThermalPolicy()
|
|
with pytest.raises(Exception):
|
|
policy.load_from_json(json_obj)
|
|
|
|
def test_load_duplicate_action():
|
|
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
|
with open(os.path.join(test_path, 'duplicate_action.json')) as f:
|
|
json_obj = json.load(f)
|
|
policy = ThermalPolicy()
|
|
with pytest.raises(Exception):
|
|
policy.load_from_json(json_obj)
|
|
|
|
def test_load_empty_condition():
|
|
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
|
with open(os.path.join(test_path, 'empty_condition.json')) as f:
|
|
json_obj = json.load(f)
|
|
policy = ThermalPolicy()
|
|
with pytest.raises(Exception):
|
|
policy.load_from_json(json_obj)
|
|
|
|
def test_load_empty_action():
|
|
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
|
|
with open(os.path.join(test_path, 'empty_action.json')) as f:
|
|
json_obj = json.load(f)
|
|
policy = ThermalPolicy()
|
|
with pytest.raises(Exception):
|
|
policy.load_from_json(json_obj)
|
|
|
|
def test_load_policy_with_same_conditions():
|
|
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
|
|
class MockThermalManager(ThermalManagerBase):
|
|
pass
|
|
|
|
with pytest.raises(Exception):
|
|
MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json'))
|
|
|
|
def test_dynamic_minimum_table_data():
|
|
from sonic_platform.device_data import DEVICE_DATA
|
|
for platform, platform_data in DEVICE_DATA.items():
|
|
if 'thermal' in platform_data and 'minimum_table' in platform_data['thermal']:
|
|
minimum_table = platform_data['thermal']['minimum_table']
|
|
check_minimum_table_data(platform, minimum_table)
|
|
|
|
def check_minimum_table_data(platform, minimum_table):
|
|
valid_dir = ['p2c', 'c2p', 'unk']
|
|
valid_trust_state = ['trust', 'untrust']
|
|
|
|
for category, data in minimum_table.items():
|
|
key_data = category.split('_')
|
|
assert key_data[0] in valid_dir
|
|
assert key_data[1] in valid_trust_state
|
|
|
|
data_list = [(value, key) for key, value in data.items()]
|
|
data_list.sort(key=lambda x : x[0])
|
|
|
|
previous_edge = None
|
|
previous_cooling_level = None
|
|
for item in data_list:
|
|
cooling_level = item[0]
|
|
range_str = item[1]
|
|
|
|
ranges = range_str.split(':')
|
|
low = int(ranges[0])
|
|
high = int(ranges[1])
|
|
assert low < high
|
|
|
|
if previous_edge is None:
|
|
assert low == -127
|
|
else:
|
|
assert low - previous_edge == 1, '{}-{}-{} error, item={}'.format(platform, key_data[0], key_data[1], item)
|
|
previous_edge = high
|
|
|
|
assert 10 <= cooling_level <= 20
|
|
if previous_cooling_level is not None:
|
|
assert cooling_level > previous_cooling_level
|
|
previous_cooling_level = cooling_level
|
|
|
|
@patch('sonic_platform.thermal.Thermal.monitor_asic_themal_zone', MagicMock())
|
|
@patch('sonic_platform.device_data.DeviceDataManager.get_platform_name')
|
|
@patch('sonic_platform.thermal.Thermal.get_min_allowed_cooling_level_by_thermal_zone')
|
|
@patch('sonic_platform.thermal.Thermal.get_min_amb_temperature')
|
|
@patch('sonic_platform.thermal.Thermal.check_module_temperature_trustable')
|
|
def test_thermal_recover_policy(mock_check_trustable, mock_get_min_amb, moc_get_min_allowed, mock_platform_name):
|
|
from sonic_platform.thermal_infos import ChassisInfo
|
|
from sonic_platform.thermal_actions import ThermalRecoverAction
|
|
chassis = MockChassis()
|
|
mock_platform_name.return_value = 'invalid'
|
|
info = ChassisInfo()
|
|
info._chassis = chassis
|
|
thermal_info_dict = {ChassisInfo.INFO_NAME: info}
|
|
|
|
Thermal.expect_cooling_level = None
|
|
action = ThermalRecoverAction()
|
|
moc_get_min_allowed.return_value = 2
|
|
action.execute(thermal_info_dict)
|
|
assert Thermal.expect_cooling_level == 6
|
|
Thermal.last_set_cooling_level = Thermal.expect_cooling_level
|
|
|
|
Thermal.expect_cooling_level = None
|
|
mock_platform_name.return_value = 'x86_64-mlnx_msn2700-r0'
|
|
mock_check_trustable.return_value = 'trust'
|
|
mock_get_min_amb.return_value = 29999
|
|
moc_get_min_allowed.return_value = None
|
|
action.execute(thermal_info_dict)
|
|
assert Thermal.expect_cooling_level is None
|
|
|
|
moc_get_min_allowed.return_value = 4
|
|
action.execute(thermal_info_dict)
|
|
assert Thermal.expect_cooling_level == 4
|
|
Thermal.last_set_cooling_level = Thermal.expect_cooling_level
|
|
|
|
mock_check_trustable.return_value = 'untrust'
|
|
mock_get_min_amb.return_value = 31001
|
|
action.execute(thermal_info_dict)
|
|
assert Thermal.expect_cooling_level == 5
|
|
|
|
|
|
@patch('sonic_platform.thermal.Thermal.set_cooling_state')
|
|
@patch('sonic_platform.utils.read_int_from_file')
|
|
def test_monitor_asic_themal_zone(mock_read_int, mock_set_cooling_state):
|
|
mock_read_int.side_effect = [111000, 105000]
|
|
Thermal.monitor_asic_themal_zone()
|
|
assert Thermal.expect_cooling_state == MAX_COOLING_LEVEL
|
|
Thermal.commit_cooling_level({})
|
|
mock_set_cooling_state.assert_called_with(MAX_COOLING_LEVEL)
|
|
mock_read_int.reset()
|
|
mock_read_int.side_effect = [104000, 105000]
|
|
Thermal.monitor_asic_themal_zone()
|
|
assert Thermal.expect_cooling_state is None
|
|
|
|
|
|
def test_set_expect_cooling_level():
|
|
Thermal.set_expect_cooling_level(5)
|
|
assert Thermal.expect_cooling_level == 5
|
|
|
|
Thermal.set_expect_cooling_level(3)
|
|
assert Thermal.expect_cooling_level == 5
|
|
|
|
Thermal.set_expect_cooling_level(10)
|
|
assert Thermal.expect_cooling_level == 10
|
|
|
|
|
|
@patch('sonic_platform.thermal.Thermal.commit_cooling_level', MagicMock())
|
|
@patch('sonic_platform.thermal_conditions.AnyFanFaultCondition.is_match')
|
|
@patch('sonic_platform.thermal_manager.ThermalManager._collect_thermal_information')
|
|
@patch('sonic_platform.thermal.Thermal.set_expect_cooling_level')
|
|
def test_run_policy(mock_expect, mock_collect_info, mock_match, thermal_manager):
|
|
chassis = MockChassis()
|
|
mock_collect_info.side_effect = Exception('')
|
|
thermal_manager.run_policy(chassis)
|
|
mock_expect.assert_called_with(MAX_COOLING_LEVEL)
|
|
|
|
mock_collect_info.side_effect = None
|
|
mock_expect.reset_mock()
|
|
mock_match.side_effect = Exception('')
|
|
thermal_manager.run_policy(chassis)
|
|
mock_expect.assert_called_with(MAX_COOLING_LEVEL)
|
|
|
|
thermal_manager.stop()
|
|
mock_expect.reset_mock()
|
|
thermal_manager.run_policy(chassis)
|
|
assert mock_expect.call_count == 0
|
|
|