sonic-buildimage/platform/mellanox/mlnx-platform-api/tests/test_thermal.py
Junchao-Mellanox 2759a42551 [Mellanox] Optimize thermal control policies (#9452)
- Why I did it
Optimize thermal control policies to simplify the logic and add more protection code in policies to make sure it works even if kernel algorithm does not work.

- How I did it
Reduce unused thermal policies
Add timely ASIC temperature check in thermal policy to make sure ASIC temperature and fan speed is coordinated
Minimum allowed fan speed now is calculated by max of the expected fan speed among all policies
Move some logic from fan.py to thermal.py to make it more readable

- How to verify it
1. Manual test
2. Regression
2022-01-22 22:40:38 -08:00

311 lines
15 KiB
Python

#
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import glob
import os
import pytest
import sys
if sys.version_info.major == 3:
from unittest import mock
else:
import mock
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
from sonic_platform.chassis import Chassis
from sonic_platform.device_data import DeviceDataManager
class TestThermal:
@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_gearbox_count', mock.MagicMock(return_value=2))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_cpu_thermal_count', mock.MagicMock(return_value=2))
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name', mock.MagicMock(return_value='x86_64-mlnx_msn2700-r0'))
def test_chassis_thermal(self):
from sonic_platform.thermal import THERMAL_NAMING_RULE
os.path.exists = mock.MagicMock(return_value=True)
chassis = Chassis()
thermal_list = chassis.get_all_thermals()
assert thermal_list
thermal_dict = {thermal.get_name(): thermal for thermal in thermal_list}
gearbox_thermal_rule = None
cpu_thermal_rule = None
for rule in THERMAL_NAMING_RULE['chassis thermals']:
thermal_type = rule.get('type', 'single')
if thermal_type == 'single':
thermal_name = rule['name']
if rule['temperature'] == 'comex_amb':
assert thermal_name not in thermal_dict
continue
default_present = rule.get('default_present', True)
if not default_present:
assert thermal_name not in thermal_dict
continue
assert thermal_name in thermal_dict
thermal = thermal_dict[thermal_name]
assert rule['temperature'] in thermal.temperature
assert rule['high_threshold'] in thermal.high_threshold if 'high_threshold' in rule else thermal.high_threshold is None
assert rule['high_critical_threshold'] in thermal.high_critical_threshold if 'high_critical_threshold' in rule else thermal.high_critical_threshold is None
else:
if 'Gearbox' in rule['name']:
gearbox_thermal_rule = rule
elif 'CPU Core' in rule['name']:
cpu_thermal_rule = rule
gearbox_thermal_count = 0
cpu_thermal_count = 0
for thermal in thermal_list:
if 'Gearbox' in thermal.get_name():
start_index = gearbox_thermal_rule.get('start_index', 1)
start_index += gearbox_thermal_count
assert thermal.get_name() == gearbox_thermal_rule['name'].format(start_index)
assert gearbox_thermal_rule['temperature'].format(start_index) in thermal.temperature
assert gearbox_thermal_rule['high_threshold'].format(start_index) in thermal.high_threshold
assert gearbox_thermal_rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
gearbox_thermal_count += 1
elif 'CPU Core' in thermal.get_name():
start_index = cpu_thermal_rule.get('start_index', 1)
start_index += cpu_thermal_count
assert thermal.get_name() == cpu_thermal_rule['name'].format(start_index)
assert cpu_thermal_rule['temperature'].format(start_index) in thermal.temperature
assert cpu_thermal_rule['high_threshold'].format(start_index) in thermal.high_threshold
assert cpu_thermal_rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
cpu_thermal_count += 1
assert gearbox_thermal_count == 2
assert cpu_thermal_count == 2
def test_chassis_thermal_includes(self):
from sonic_platform.thermal import THERMAL_NAMING_RULE
DeviceDataManager.get_platform_name = mock.MagicMock(return_value='x86_64-nvidia_sn2201-r0')
DeviceDataManager.get_thermal_capability = mock.MagicMock(return_value={'comex_amb': False, 'cpu_amb': True, 'swb_amb': True})
chassis = Chassis()
thermal_list = chassis.get_all_thermals()
assert thermal_list
thermal_dict = {thermal.get_name(): thermal for thermal in thermal_list}
for rule in THERMAL_NAMING_RULE['chassis thermals']:
default_present = rule.get('default_present', True)
if not default_present:
thermal_name = rule['name']
assert thermal_name in thermal_dict
def test_psu_thermal(self):
from sonic_platform.thermal import initialize_psu_thermal, THERMAL_NAMING_RULE
os.path.exists = mock.MagicMock(return_value=True)
presence_cb = mock.MagicMock(return_value=(True, ''))
thermal_list = initialize_psu_thermal(0, presence_cb)
assert len(thermal_list) == 1
thermal = thermal_list[0]
rule = THERMAL_NAMING_RULE['psu thermals']
start_index = rule.get('start_index', 1)
assert thermal.get_name() == rule['name'].format(start_index)
assert rule['temperature'].format(start_index) in thermal.temperature
assert rule['high_threshold'].format(start_index) in thermal.high_threshold
assert thermal.high_critical_threshold is None
assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False
presence_cb = mock.MagicMock(return_value=(False, 'Not present'))
thermal_list = initialize_psu_thermal(0, presence_cb)
assert len(thermal_list) == 1
thermal = thermal_list[0]
assert thermal.get_temperature() is None
assert thermal.get_high_threshold() is None
assert thermal.get_high_critical_threshold() is None
def test_sfp_thermal(self):
from sonic_platform.thermal import initialize_sfp_thermal, THERMAL_NAMING_RULE
os.path.exists = mock.MagicMock(return_value=True)
thermal_list = initialize_sfp_thermal(0)
assert len(thermal_list) == 1
thermal = thermal_list[0]
rule = THERMAL_NAMING_RULE['sfp thermals']
start_index = rule.get('start_index', 1)
assert thermal.get_name() == rule['name'].format(start_index)
assert rule['temperature'].format(start_index) in thermal.temperature
assert rule['high_threshold'].format(start_index) in thermal.high_threshold
assert rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False
def test_get_temperature(self):
from sonic_platform.thermal import Thermal
from sonic_platform import utils
thermal = Thermal('test', 'temp_file', None, None, 1)
utils.read_float_from_file = mock.MagicMock(return_value=35727)
assert thermal.get_temperature() == 35.727
utils.read_float_from_file = mock.MagicMock(return_value=0.0)
assert thermal.get_temperature() is None
utils.read_float_from_file = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None
def test_get_high_threshold(self):
from sonic_platform.thermal import Thermal
from sonic_platform import utils
thermal = Thermal('test', None, None, None, 1)
assert thermal.get_high_threshold() is None
thermal.high_threshold = 'high_th_file'
utils.read_float_from_file = mock.MagicMock(return_value=25833)
assert thermal.get_temperature() == 25.833
utils.read_float_from_file = mock.MagicMock(return_value=0.0)
assert thermal.get_temperature() is None
utils.read_float_from_file = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None
def test_get_high_critical_threshold(self):
from sonic_platform.thermal import Thermal
from sonic_platform import utils
thermal = Thermal('test', None, None, None, 1)
assert thermal.get_high_critical_threshold() is None
thermal.high_critical_threshold = 'high_th_file'
utils.read_float_from_file = mock.MagicMock(return_value=120839)
assert thermal.get_high_critical_threshold() == 120.839
utils.read_float_from_file = mock.MagicMock(return_value=0.0)
assert thermal.get_high_critical_threshold() is None
utils.read_float_from_file = mock.MagicMock(return_value=None)
assert thermal.get_high_critical_threshold() is None
def test_set_thermal_algorithm_status(self):
from sonic_platform.thermal import Thermal, THERMAL_ZONE_FOLDER_WILDCARD, THERMAL_ZONE_POLICY_FILE, THERMAL_ZONE_MODE_FILE
from sonic_platform import utils
glob.iglob = mock.MagicMock(return_value=['thermal_zone1', 'thermal_zone2'])
utils.write_file = mock.MagicMock()
assert Thermal.set_thermal_algorithm_status(True, False)
for folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_POLICY_FILE), 'step_wise')
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_MODE_FILE), 'enabled')
assert Thermal.set_thermal_algorithm_status(False, False)
for folder in glob.iglob(THERMAL_ZONE_FOLDER_WILDCARD):
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_POLICY_FILE), 'user_space')
utils.write_file.assert_any_call(os.path.join(folder, THERMAL_ZONE_MODE_FILE), 'disabled')
assert not Thermal.set_thermal_algorithm_status(False, False)
assert Thermal.set_thermal_algorithm_status(False)
@mock.patch('glob.iglob', mock.MagicMock(return_value=['thermal_zone1', 'thermal_zone2']))
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_min_allowed_cooling_level_by_thermal_zone(self, mock_read_file):
from sonic_platform.thermal import Thermal, THERMAL_ZONE_TEMP_FILE, THERMAL_ZONE_HIGH_THRESHOLD, THERMAL_ZONE_NORMAL_THRESHOLD, MIN_COOLING_LEVEL_FOR_HIGH, MIN_COOLING_LEVEL_FOR_NORMAL
mock_read_file.side_effect = Exception('')
assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() is None
mock_file_content = {}
def mock_read_int_from_file(file_path, default=0, raise_exception=False):
return mock_file_content[file_path]
mock_read_file.side_effect = mock_read_int_from_file
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_NORMAL_THRESHOLD)] = 75000
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_HIGH_THRESHOLD)] = 85000
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 69000
mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_NORMAL_THRESHOLD)] = 75000
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_HIGH_THRESHOLD)] = 85000
mock_file_content[os.path.join('thermal_zone2', THERMAL_ZONE_TEMP_FILE)] = 24000
assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() == MIN_COOLING_LEVEL_FOR_NORMAL
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 71000
assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() == MIN_COOLING_LEVEL_FOR_HIGH
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 79000
assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() == MIN_COOLING_LEVEL_FOR_HIGH
mock_file_content[os.path.join('thermal_zone1', THERMAL_ZONE_TEMP_FILE)] = 81000
assert Thermal.get_min_allowed_cooling_level_by_thermal_zone() is None
def test_check_module_temperature_trustable(self):
from sonic_platform.thermal import Thermal
from sonic_platform import utils
glob.iglob = mock.MagicMock(return_value=['thermal_zone1', 'thermal_zone2'])
utils.read_int_from_file = mock.MagicMock(return_value=1)
assert Thermal.check_module_temperature_trustable() == 'untrust'
utils.read_int_from_file = mock.MagicMock(return_value=0)
assert Thermal.check_module_temperature_trustable() == 'trust'
def test_get_min_amb_temperature(self):
from sonic_platform.thermal import Thermal, MAX_AMBIENT_TEMP, CHASSIS_THERMAL_SYSFS_FOLDER
from sonic_platform import utils
utils.read_int_from_file = mock.MagicMock(side_effect=Exception(''))
assert Thermal.get_min_amb_temperature() == MAX_AMBIENT_TEMP
mock_file_content = {}
def mock_read_int_from_file(file_path, default=0, raise_exception=False):
return mock_file_content[file_path]
utils.read_int_from_file = mock_read_int_from_file
mock_file_content[os.path.join(CHASSIS_THERMAL_SYSFS_FOLDER, 'fan_amb')] = 50
mock_file_content[os.path.join(CHASSIS_THERMAL_SYSFS_FOLDER, 'port_amb')] = 40
assert Thermal.get_min_amb_temperature() == 40
@mock.patch('sonic_platform.utils.write_file')
def test_set_cooling_level(self, mock_write_file):
from sonic_platform.thermal import Thermal, COOLING_STATE_PATH
Thermal.set_cooling_level(10)
calls = [mock.call(COOLING_STATE_PATH, 20, raise_exception=True)]
mock_write_file.assert_has_calls(calls)
pre_call_count = mock_write_file.call_count
Thermal.set_cooling_level(10)
assert pre_call_count == mock_write_file.call_count
Thermal.set_cooling_level(9)
calls = [mock.call(COOLING_STATE_PATH, 19, raise_exception=True)]
mock_write_file.assert_has_calls(calls)
@mock.patch('sonic_platform.utils.write_file')
def test_set_cooling_state(self, mock_write_file):
from sonic_platform.thermal import Thermal, COOLING_STATE_PATH
Thermal.set_cooling_state(10)
calls = [mock.call(COOLING_STATE_PATH, 10, raise_exception=True)]
mock_write_file.assert_has_calls(calls)
pre_call_count = mock_write_file.call_count
Thermal.set_cooling_state(10)
assert pre_call_count == mock_write_file.call_count
Thermal.set_cooling_state(9)
calls = [mock.call(COOLING_STATE_PATH, 9, raise_exception=True)]
mock_write_file.assert_has_calls(calls)
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_cooling_level(self, mock_read_file):
from sonic_platform.thermal import Thermal, COOLING_STATE_PATH
Thermal.get_cooling_level()
mock_read_file.assert_called_with(COOLING_STATE_PATH, raise_exception=True)
mock_read_file.side_effect = IOError('')
with pytest.raises(RuntimeError):
Thermal.get_cooling_level()
mock_read_file.side_effect = ValueError('')
with pytest.raises(RuntimeError):
Thermal.get_cooling_level()