[Mellanox] Add CPU thermal control for Nvidia platforms (#10202)

Why I did it
Add CPU thermal control for Nvidia platforms. It is intended for platforms that have heavy CPU load; currently it is enabled only on the 4800, and it will be enabled on future platforms.

How I did it
Check CPU pack temperature and update cooling level accordingly

How to verify it
Manual test
Added a sonic-mgmt test case; the PR link will be added later
This commit is contained in:
Junchao-Mellanox 2022-03-22 00:54:52 +08:00 committed by GitHub
parent 4caa887642
commit f0ddd102d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 202 additions and 6 deletions

View File

@ -0,0 +1,70 @@
#
# Copyright (c) 2019-2022 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#############################################################################
# Mellanox
#
# Module implements CPU thermal control: a periodic task that reads the
# CPU pack temperature and adjusts the CPU cooling state accordingly
#
#############################################################################
from sonic_py_common.task_base import ThreadTaskBase
from . import utils
from .device_data import DeviceDataManager
class CPUThermalControl(ThreadTaskBase):
    """Periodic task that adjusts the CPU cooling state from the CPU temperature.

    Every ``INTERVAL`` seconds the task reads the value in ``CPU_TEMP_FILE``
    and writes a cooling state to ``CPU_COOLING_STATE``:

    * below the low threshold  -> ``MIN_COOLING_STATE``
    * above the high threshold -> ``MAX_COOLING_STATE``
    * in between               -> current state stepped by one in the
      direction the temperature moved since the previous iteration
    """

    CPU_COOLING_STATE = '/var/run/hw-management/thermal/cooling2_cur_state'
    CPU_TEMP_FILE = '/var/run/hw-management/thermal/cpu_pack'
    MAX_COOLING_STATE = 10
    MIN_COOLING_STATE = 2
    INTERVAL = 3

    def __init__(self):
        super(CPUThermalControl, self).__init__()
        # (low, high) thresholds come from the per-platform device data.
        thresholds = DeviceDataManager.get_cpu_thermal_threshold()
        self.temp_low, self.temp_high = thresholds

    def task_worker(self):
        """Run the control loop until the stopping event is set."""
        previous_temp = 0
        while True:
            # wait() doubles as the sleep between iterations; a truthy
            # result means a stop was requested.
            if self.task_stopping_event.wait(self.INTERVAL):
                break
            previous_temp = self.run(previous_temp)

    def run(self, last_temp):
        """Execute one control iteration.

        Args:
            last_temp: temperature returned by the previous iteration.

        Returns:
            The temperature read in this iteration.
        """
        temp = self.read_cpu_temp()
        if temp < self.temp_low:
            target = self.MIN_COOLING_STATE
        elif temp > self.temp_high:
            target = self.MAX_COOLING_STATE
        else:
            # Between thresholds: follow the temperature trend one step at
            # a time, clamped to [MIN_COOLING_STATE, MAX_COOLING_STATE].
            state = self.get_cooling_state()
            if temp > last_temp:
                target = min(state + 1, self.MAX_COOLING_STATE)
            elif temp < last_temp:
                target = max(state - 1, self.MIN_COOLING_STATE)
            else:
                # Temperature unchanged: leave the cooling state alone.
                target = None

        if target is not None:
            self.set_cooling_state(target)
        return temp

    def set_cooling_state(self, state):
        """Write ``state`` to the CPU cooling state file."""
        utils.write_file(self.CPU_COOLING_STATE, state, log_func=None)

    def get_cooling_state(self):
        """Read the current cooling state; ``MAX_COOLING_STATE`` is passed as the read default."""
        return utils.read_int_from_file(
            self.CPU_COOLING_STATE,
            default=self.MAX_COOLING_STATE,
            log_func=None,
        )

    def read_cpu_temp(self):
        """Read the CPU temperature; the high threshold is passed as the read default.

        Values above 1000 are divided by 1000 and truncated to an int
        (presumably the file may report millidegrees - confirm).
        """
        raw = utils.read_int_from_file(
            self.CPU_TEMP_FILE,
            default=self.temp_high,
            log_func=None,
        )
        if raw > 1000:
            return int(raw / 1000)
        return raw

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2020-2021 NVIDIA CORPORATION & AFFILIATES. # Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0 # Apache-2.0
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
@ -148,7 +148,8 @@ DEVICE_DATA = {
'thermal': { 'thermal': {
"capability": { "capability": {
"comex_amb": False "comex_amb": False
} },
'cpu_threshold': (80, 95) # min=80, max=95
}, },
'sfp': { 'sfp': {
'max_port_per_line_card': 16 'max_port_per_line_card': 16
@ -263,3 +264,20 @@ class DeviceDataManager:
if not sfp_data: if not sfp_data:
return 0 return 0
return sfp_data.get('max_port_per_line_card', 0) return sfp_data.get('max_port_per_line_card', 0)
@classmethod
def is_cpu_thermal_control_supported(cls):
    """Return True when the platform defines a CPU thermal threshold.

    A threshold of ``(None, None)`` is treated as "not supported".
    """
    not_configured = (None, None)
    threshold = cls.get_cpu_thermal_threshold()
    return threshold != not_configured
@classmethod
@utils.read_only_cache()
def get_cpu_thermal_threshold(cls):
    """Return the CPU thermal threshold tuple for the current platform.

    Looks up ``DEVICE_DATA[<platform name>]['thermal']['cpu_threshold']``.
    If the platform entry, its 'thermal' section, or the 'cpu_threshold'
    key is missing (or the entry/section is empty), ``(None, None)`` is
    returned.
    """
    # Falling back to an empty dict at each level lets a single final
    # lookup supply the (None, None) default for every "missing" case.
    platform_data = DEVICE_DATA.get(cls.get_platform_name(), None) or {}
    thermal_data = platform_data.get('thermal', None) or {}
    return thermal_data.get('cpu_threshold', (None, None))

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. # Copyright (c) 2019-2022 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0 # Apache-2.0
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
@ -524,7 +524,7 @@ class Thermal(ThermalBase):
else: else:
cls.expect_cooling_state = None cls.expect_cooling_state = None
class RemovableThermal(Thermal): class RemovableThermal(Thermal):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb): def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb):
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position) super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position)

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2020-2021 NVIDIA CORPORATION & AFFILIATES. # Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0 # Apache-2.0
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
@ -15,6 +15,8 @@
# limitations under the License. # limitations under the License.
# #
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from .cpu_thermal_control import CPUThermalControl
from .device_data import DeviceDataManager
from .thermal_actions import * from .thermal_actions import *
from .thermal_conditions import * from .thermal_conditions import *
from .thermal_infos import * from .thermal_infos import *
@ -22,6 +24,8 @@ from .thermal import logger, MAX_COOLING_LEVEL, Thermal
class ThermalManager(ThermalManagerBase): class ThermalManager(ThermalManagerBase):
cpu_thermal_control = None
@classmethod @classmethod
def start_thermal_control_algorithm(cls): def start_thermal_control_algorithm(cls):
""" """
@ -42,8 +46,30 @@ class ThermalManager(ThermalManagerBase):
""" """
Thermal.set_thermal_algorithm_status(False) Thermal.set_thermal_algorithm_status(False)
@classmethod
def start_cpu_thermal_control_algoritm(cls):
    """Create and start the CPU thermal control task if it is not running.

    Does nothing when a task is already stored on the class or when the
    platform does not support CPU thermal control.
    """
    already_started = bool(cls.cpu_thermal_control)
    # Short-circuit keeps the platform query from running once started.
    if already_started or not DeviceDataManager.is_cpu_thermal_control_supported():
        return

    controller = CPUThermalControl()
    cls.cpu_thermal_control = controller
    controller.task_run()
@classmethod
def stop_cpu_thermal_control_algoritm(cls):
    """Stop the CPU thermal control task, if one is stored on the class.

    After stopping, the stored reference is cleared so that a later start
    creates a new task.
    """
    controller = cls.cpu_thermal_control
    if not controller:
        return
    controller.task_stop()
    cls.cpu_thermal_control = None
@classmethod @classmethod
def run_policy(cls, chassis): def run_policy(cls, chassis):
if cls._running:
cls.start_cpu_thermal_control_algoritm()
else:
cls.stop_cpu_thermal_control_algoritm()
if not cls._policy_dict: if not cls._policy_dict:
return return
@ -59,7 +85,6 @@ class ThermalManager(ThermalManagerBase):
if not cls._running: if not cls._running:
return return
try: try:
print(policy.name)
if policy.is_match(cls._thermal_info_dict): if policy.is_match(cls._thermal_info_dict):
policy.do_action(cls._thermal_info_dict) policy.do_action(cls._thermal_info_dict)
except Exception as e: except Exception as e:

View File

@ -0,0 +1,83 @@
#
# Copyright (c) 2019-2022 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#############################################################################
# Mellanox
#
# Module contains unit tests for the CPU thermal control implementation
# (sonic_platform.cpu_thermal_control.CPUThermalControl)
#
#############################################################################
import glob
import os
import pytest
import sys
if sys.version_info.major == 3:
from unittest import mock
else:
import mock
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
from sonic_platform.cpu_thermal_control import CPUThermalControl
class TestCPUThermalControl:
    """Tests for CPUThermalControl.run() with all file access mocked."""

    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_cpu_thermal_threshold', mock.MagicMock(return_value=(85, 95)))
    @mock.patch('sonic_platform.utils.read_int_from_file')
    @mock.patch('sonic_platform.utils.write_file')
    def test_run(self, mock_write_file, mock_read_file):
        """Drive run() through each threshold/trend case and check the write."""
        state_path = CPUThermalControl.CPU_COOLING_STATE
        temp_path = CPUThermalControl.CPU_TEMP_FILE
        highest = CPUThermalControl.MAX_COOLING_STATE
        lowest = CPUThermalControl.MIN_COOLING_STATE

        controller = CPUThermalControl()

        # Fake file contents keyed by path; the mocked reader serves from here.
        fake_files = {}
        fake_files[state_path] = 5
        fake_files[temp_path] = controller.temp_high + 1
        mock_read_file.side_effect = lambda file_path, **kwargs: fake_files[file_path]

        def run_and_expect(last_temp, expected_state):
            # One control iteration, then verify the most recent write.
            controller.run(last_temp)
            mock_write_file.assert_called_with(state_path, expected_state, log_func=None)

        # Current temp above the high threshold -> max cooling state
        run_and_expect(0, highest)

        # Current temp below the low threshold -> min cooling state
        fake_files[temp_path] = controller.temp_low - 1
        run_and_expect(0, lowest)

        # Temp between thresholds and rising -> state steps up (5 -> 6)
        fake_files[temp_path] = controller.temp_low
        run_and_expect(0, 6)

        # Temp between thresholds and falling -> state steps down (5 -> 4)
        run_and_expect(controller.temp_low + 1, 4)

        # Rising temp while the cooling state is already at the max -> stays at max
        fake_files[temp_path] = 85
        fake_files[state_path] = highest
        run_and_expect(84, highest)

        # Falling temp while the cooling state is already at the min -> stays at min
        fake_files[state_path] = lowest
        run_and_expect(86, lowest)