[Mellanox] update asic and module temperature in a thread for CMIS management (#16955)

- Why I did it
When a module is fully under software control, the driver cannot get the module temperature or temperature thresholds from firmware. In this case, SONiC needs to read the temperature and temperature thresholds from the module EEPROM. This PR adds a thermal updater thread that updates the module temperature and temperature thresholds while software control is enabled.

- How I did it
Query ASIC temperature from SDK sysfs and update hw-management-tc periodically
Query Module temperature from EEPROM and update hw-management-tc periodically

- How to verify it
Manual test
New Unit tests
This commit is contained in:
Junchao-Mellanox 2023-12-13 20:19:44 +08:00 committed by GitHub
parent 0d62cf0e92
commit 1b84f3daa5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 675 additions and 26 deletions

View File

@ -82,6 +82,8 @@ class Chassis(ChassisBase):
# System UID LED
_led_uid = None
chassis_instance = None
def __init__(self):
super(Chassis, self).__init__()
@ -127,6 +129,8 @@ class Chassis(ChassisBase):
self._RJ45_port_inited = False
self._RJ45_port_list = None
Chassis.chassis_instance = self
self.modules_mgmt_thread = threading.Thread()
self.modules_changes_queue = queue.Queue()
self.modules_mgmt_task_stopping_event = threading.Event()

View File

@ -31,6 +31,8 @@ try:
from . import utils
from .device_data import DeviceDataManager
from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
from sonic_platform_base.sonic_xcvr.fields import consts
from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
@ -155,6 +157,10 @@ SFP_TYPE_SFF8636 = 'sff8636'
# SFP stderr
SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'
SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
SFP_TEMPERATURE_SCALE = 8.0
# SFP EEPROM limited bytes
limited_eeprom = {
SFP_TYPE_CMIS: {
@ -264,7 +270,7 @@ class SFP(NvidiaSFPCommon):
if slot_id == 0: # For non-modular chassis
from .thermal import initialize_sfp_thermal
self._thermal_list = initialize_sfp_thermal(sfp_index)
self._thermal_list = initialize_sfp_thermal(self)
else: # For modular chassis
# (slot_id % MAX_LC_CONUNT - 1) * MAX_PORT_COUNT + (sfp_index + 1) * (MAX_PORT_COUNT / LC_PORT_COUNT)
max_linecard_count = DeviceDataManager.get_linecard_count()
@ -822,6 +828,77 @@ class SFP(NvidiaSFPCommon):
api = self.get_xcvr_api()
return [False] * api.NUM_CHANNELS if api else None
def get_temperature(self):
    """Get the module temperature

    When the module is not under software control, the raw integer is read from
    the SDK sysfs node and divided by SFP_TEMPERATURE_SCALE. Otherwise the
    transceiver state is refreshed with reinit() and the value is taken from the
    parent class implementation.

    Returns:
        float: module temperature. None if the sysfs node does not exist, if the
        sysfs read yields None, or if the parent implementation returns None.
        0.0 if checking the control mode or reading sysfs raised an exception.
    """
    try:
        if not self.is_sw_control():
            temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
            if not os.path.exists(temp_file):
                logger.log_error(f'Failed to read from file {temp_file} - not exists')
                return None
            temperature = utils.read_int_from_file(temp_file,
                                                   log_func=None)
            return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else None
    except Exception:
        # Best effort: report 0.0 instead of failing. Only Exception is caught so
        # that KeyboardInterrupt/SystemExit are not swallowed.
        return 0.0

    # Software control: read through the xcvr API of the parent class
    self.reinit()
    return super().get_temperature()
def get_temperature_warning_threashold(self):
    """Get temperature warning threshold

    When the module is not under software control, the raw integer is read from
    the SDK sysfs "emergency" node and divided by SFP_TEMPERATURE_SCALE.
    Otherwise the threshold is looked up in the thresholds read from the module
    EEPROM.

    Returns:
        float: temperature warning threshold. SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
        is returned when the value cannot be obtained.
    """
    try:
        if not self.is_sw_control():
            emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
                                                 log_func=None,
                                                 default=None)
            return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
    except Exception:
        # Only Exception is caught so that KeyboardInterrupt/SystemExit propagate
        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_WARNING_FIELD]
    return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
def get_temperature_critical_threashold(self):
    """Get temperature critical threshold

    When the module is not under software control, the raw integer is read from
    the SDK sysfs "critical" node and divided by SFP_TEMPERATURE_SCALE.
    Otherwise the threshold is looked up in the thresholds read from the module
    EEPROM.

    Returns:
        float: temperature critical threshold. SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
        is returned when the value cannot be obtained.
    """
    try:
        if not self.is_sw_control():
            critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
                                                log_func=None,
                                                default=None)
            return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
    except Exception:
        # Only Exception is caught so that KeyboardInterrupt/SystemExit propagate
        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_ALARM_FIELD]
    return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
def _get_temperature_threshold(self):
    """Read the threshold values of the module from its EEPROM

    Returns:
        Whatever the xcvr EEPROM read returns for the thresholds field, or None
        when no xcvr API is available or the API reports that thresholds are
        not supported.
    """
    self.reinit()
    xcvr_api = self.get_xcvr_api()
    if not xcvr_api:
        return None

    if not xcvr_api.get_transceiver_thresholds_support():
        return None

    # SFF-8636/SFF-8436 APIs are read through a dedicated thresholds field
    if isinstance(xcvr_api, (sff8636.Sff8636Api, sff8436.Sff8436Api)):
        return xcvr_api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
    return xcvr_api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
def get_xcvr_api(self):
"""
Retrieves the XcvrApi associated with this SFP

View File

@ -36,6 +36,8 @@ except ImportError as e:
# Global logger class instance
logger = Logger()
DEFAULT_TEMP_SCALE = 1000
"""
The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and
high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types
@ -72,9 +74,11 @@ THERMAL_NAMING_RULE = {
"chassis thermals": [
{
"name": "ASIC",
"temperature": "asic",
"high_threshold": "asic_temp_emergency",
"high_critical_threshold": "asic_temp_trip_crit"
"temperature": "input",
"high_threshold_default": 105,
"high_critical_threshold_default": 120,
"sysfs_folder": "/sys/module/sx_core/asic0/temperature",
"scale": 8
},
{
"name": "Ambient Port Side Temp",
@ -187,8 +191,8 @@ def initialize_psu_thermal(psu_index, presence_cb):
return [create_indexable_thermal(THERMAL_NAMING_RULE['psu thermals'], psu_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1, presence_cb)]
def initialize_sfp_thermal(sfp_index):
return [create_indexable_thermal(THERMAL_NAMING_RULE['sfp thermals'], sfp_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1)]
def initialize_sfp_thermal(sfp):
return [ModuleThermal(sfp)]
def initialize_linecard_thermals(lc_name, lc_index):
@ -214,6 +218,7 @@ def initialize_linecard_sfp_thermal(lc_name, lc_index, sfp_index):
def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=None):
index += rule.get('start_index', 1)
name = rule['name'].format(index)
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
temp_file = os.path.join(sysfs_folder, rule['temperature'].format(index))
_check_thermal_sysfs_existence(temp_file)
if 'high_threshold' in rule:
@ -226,10 +231,13 @@ def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=No
_check_thermal_sysfs_existence(high_crit_th_file)
else:
high_crit_th_file = None
high_th_default = rule.get('high_threshold_default')
high_crit_th_default = rule.get('high_critical_threshold_default')
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
if not presence_cb:
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position)
return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
else:
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb)
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)
def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
@ -243,6 +251,7 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
elif not default_present:
return None
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
temp_file = os.path.join(sysfs_folder, temp_file)
_check_thermal_sysfs_existence(temp_file)
if 'high_threshold' in rule:
@ -255,11 +264,14 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
_check_thermal_sysfs_existence(high_crit_th_file)
else:
high_crit_th_file = None
high_th_default = rule.get('high_threshold_default')
high_crit_th_default = rule.get('high_critical_threshold_default')
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
name = rule['name']
if not presence_cb:
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position)
return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
else:
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb)
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)
def _check_thermal_sysfs_existence(file_path):
@ -268,7 +280,7 @@ def _check_thermal_sysfs_existence(file_path):
class Thermal(ThermalBase):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position):
"""
index should be a string for category ambient and int for other categories
"""
@ -278,6 +290,9 @@ class Thermal(ThermalBase):
self.temperature = temp_file
self.high_threshold = high_th_file
self.high_critical_threshold = high_crit_th_file
self.high_th_default = high_th_default
self.high_crit_th_default = high_crit_th_default
self.scale = scale
def get_name(self):
"""
@ -297,7 +312,7 @@ class Thermal(ThermalBase):
of one degree Celsius, e.g. 30.125
"""
value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None
return value / self.scale if (value is not None and value != 0) else None
def get_high_threshold(self):
"""
@ -308,9 +323,9 @@ class Thermal(ThermalBase):
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
if self.high_threshold is None:
return None
return self.high_th_default
value = utils.read_float_from_file(self.high_threshold, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None
return value / self.scale if (value is not None and value != 0) else self.high_th_default
def get_high_critical_threshold(self):
"""
@ -321,9 +336,9 @@ class Thermal(ThermalBase):
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
if self.high_critical_threshold is None:
return None
return self.high_crit_th_default
value = utils.read_float_from_file(self.high_critical_threshold, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None
return value / self.scale if (value is not None and value != 0) else self.high_crit_th_default
def get_position_in_parent(self):
"""
@ -343,8 +358,8 @@ class Thermal(ThermalBase):
class RemovableThermal(Thermal):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb):
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position)
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb):
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
self.presence_cb = presence_cb
def get_temperature(self):
@ -388,3 +403,68 @@ class RemovableThermal(Thermal):
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None
return super(RemovableThermal, self).get_high_critical_threshold()
class ModuleThermal(ThermalBase):
    """Thermal object of a transceiver module; every reading is delegated to the SFP object"""

    def __init__(self, sfp):
        """
        Create the thermal object of one module

        Args:
            sfp: SFP object that owns this thermal. Its sdk_index is used to
                 build the thermal name, and its temperature getters provide
                 the readings.
        """
        super().__init__()
        self.sfp = sfp
        self.name = f'xSFP module {sfp.sdk_index + 1} Temp'

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name of the device
        """
        return self.name

    def get_temperature(self):
        """
        Retrieves current temperature reading from the owning SFP object

        Returns:
            The value returned by sfp.get_temperature()
        """
        return self.sfp.get_temperature()

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature from the owning SFP object

        Returns:
            The value returned by sfp.get_temperature_warning_threashold()
        """
        return self.sfp.get_temperature_warning_threashold()

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature from the owning SFP object

        Returns:
            The value returned by sfp.get_temperature_critical_threashold()
        """
        return self.sfp.get_temperature_critical_threashold()

    def get_position_in_parent(self):
        """
        Retrieves 1-based relative physical position in parent device

        Returns:
            integer: always 1
        """
        return 1

    def is_replaceable(self):
        """
        Indicate whether this device is replaceable.

        Returns:
            bool: always False
        """
        return False

View File

@ -15,9 +15,36 @@
# limitations under the License.
#
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from . import thermal_updater
from .device_data import DeviceDataManager
class ThermalManager(ThermalManagerBase):
    # ThermalUpdater instance created by initialize(); None until then
    thermal_updater_task = None

    @classmethod
    def run_policy(cls, chassis):
        pass

    @classmethod
    def initialize(cls):
        """
        Initialize thermal manager, including register thermal condition types and thermal action types
        and any other vendor specific initialization.

        In independent mode a ThermalUpdater is created for all SFPs of the chassis instance and started.
        :return:
        """
        if DeviceDataManager.is_independent_mode():
            # Imported here rather than at module level, as in the original code
            from .chassis import Chassis
            cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
            cls.thermal_updater_task.start()

    @classmethod
    def deinitialize(cls):
        """
        Destroy thermal manager, including any vendor specific cleanup.

        In independent mode the thermal updater is stopped, provided that initialize() created one.
        :return:
        """
        # thermal_updater_task is still None when initialize() was never called;
        # guard against it instead of raising AttributeError
        if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
            cls.thermal_updater_task.stop()

View File

@ -0,0 +1,213 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from . import utils
from sonic_py_common import logger
import sys
import time
sys.path.append('/run/hw-management/bin')
try:
import hw_management_independent_mode_update
except ImportError:
# For unit test only
from unittest import mock
hw_management_independent_mode_update = mock.MagicMock()
hw_management_independent_mode_update.module_data_set_module_counter = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_module = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock()
# Multiplier applied to the temperature/threshold values returned by the SFP
# object before they are handed to hw-management (presumably degrees Celsius to
# millidegrees - TODO confirm the unit expected by hw-management).
SFP_TEMPERATURE_SCALE = 1000
# Multiplier applied to the raw integers read from the ASIC sysfs nodes under
# /sys/module/sx_core/asic0/temperature/ before they are handed to hw-management.
ASIC_TEMPERATURE_SCALE = 125
# Fallback ASIC thresholds, returned as-is (not scaled) when the corresponding
# sysfs node cannot be read.
ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000
# Value passed as the "fault" argument to hw-management when thermal data could
# not be read.
ERROR_READ_THERMAL_DATA = 254000
# JSON file from which the ASIC/module polling intervals are loaded.
TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json'
# Module-level logger. Note: this rebinds the name of the imported `logger` module.
logger = logger.Logger('thermal-updater')
class ThermalUpdater:
    """Periodically reads ASIC and module thermal data and pushes it to hw-management-tc
    through hw_management_independent_mode_update.
    """

    def __init__(self, sfp_list):
        """
        Args:
            sfp_list: list of SFP objects whose thermal data shall be updated
        """
        self._sfp_list = sfp_list
        # Last known presence of each module, keyed by sdk_index
        self._sfp_status = {}
        self._timer = utils.Timer()

    def load_tc_config(self):
        """Load the polling intervals from TC_CONFIG_FILE and schedule the periodic updates.

        Defaults (ASIC: 1, module: 10) are used for every value that is not present
        in the file. A configured "poll_time" is halved before it is used.
        """
        asic_poll_interval = 1
        sfp_poll_interval = 10
        data = utils.load_json_file(TC_CONFIG_FILE)
        if not data:
            logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
        else:
            dev_parameters = data.get('dev_parameters')
            if dev_parameters is not None:
                asic_parameter = dev_parameters.get('asic')
                if asic_parameter is not None:
                    asic_poll_interval_config = asic_parameter.get('poll_time')
                    if asic_poll_interval_config:
                        asic_poll_interval = int(asic_poll_interval_config) / 2
                # The key is the literal string 'module\d+', not a pattern that is matched here
                module_parameter = dev_parameters.get('module\\d+')
                if module_parameter is not None:
                    sfp_poll_interval_config = module_parameter.get('poll_time')
                    if sfp_poll_interval_config:
                        sfp_poll_interval = int(sfp_poll_interval_config) / 2

        logger.log_notice(f'ASIC polling interval: {asic_poll_interval}')
        self._timer.schedule(asic_poll_interval, self.update_asic)
        logger.log_notice(f'Module polling interval: {sfp_poll_interval}')
        self._timer.schedule(sfp_poll_interval, self.update_module)

    def start(self):
        """Clean stale thermal data, wait for the SFPs and start the periodic updates.

        If the SFPs do not become ready in time, hw-management-tc is suspended and
        the timer is not started.
        """
        self.clean_thermal_data()
        if not self.wait_all_sfp_ready():
            logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend')
            self.control_tc(True)
            return
        self.control_tc(False)
        self.load_tc_config()
        self._timer.start()

    def stop(self):
        """Stop the periodic updates and suspend hw-management-tc."""
        self._timer.stop()
        self.control_tc(True)

    def control_tc(self, suspend):
        """Suspend or resume hw-management-tc.

        Args:
            suspend: True to write 1 (suspend) to the suspend file, False to write 0 (resume)
        """
        logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}')
        utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0)

    def clean_thermal_data(self):
        """Publish the module count and clean the thermal data of the ASIC and of every module."""
        hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list))
        hw_management_independent_mode_update.thermal_data_clean_asic(0)
        for sfp in self._sfp_list:
            hw_management_independent_mode_update.thermal_data_clean_module(
                0,
                sfp.sdk_index + 1
            )

    def wait_all_sfp_ready(self):
        """Wait until is_sw_control() can be called on every SFP without raising.

        All SFPs are probed up to 60 times, sleeping 1 second between attempts.

        Returns:
            bool: True if all SFPs became ready, False if the attempts were exhausted
        """
        logger.log_notice('Waiting for all SFP modules ready...')
        max_wait_time = 60
        ready_set = set()
        while len(ready_set) != len(self._sfp_list):
            for sfp in self._sfp_list:
                try:
                    sfp.is_sw_control()
                    ready_set.add(sfp)
                except Exception:
                    # Not ready yet; probe it again in the next round
                    continue
            # Check right after probing so that success on the last attempt is
            # reported as success and no extra sleep is done once all are ready
            if len(ready_set) == len(self._sfp_list):
                break
            max_wait_time -= 1
            if max_wait_time == 0:
                return False
            time.sleep(1)

        logger.log_notice('All SFP modules are ready')
        return True

    def get_asic_temp(self):
        """Return the ASIC temperature read from sysfs multiplied by ASIC_TEMPERATURE_SCALE,
        or None if the read yields None.
        """
        temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
        return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None

    def get_asic_temp_warning_threashold(self):
        """Return the ASIC warning threshold read from the sysfs "emergency" node multiplied by
        ASIC_TEMPERATURE_SCALE, or ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD if the read yields None.
        """
        emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
        return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD

    def get_asic_temp_critical_threashold(self):
        """Return the ASIC critical threshold read from the sysfs "critical" node multiplied by
        ASIC_TEMPERATURE_SCALE, or ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD if the read yields None.
        """
        critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
        return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def update_single_module(self, sfp):
        """Push the thermal data of one module to hw-management.

        A present module gets its temperature and thresholds set. A module that
        just became absent gets its data cleaned. Any exception is logged and
        reported to hw-management as a fault.

        Args:
            sfp: SFP object to update
        """
        try:
            presence = sfp.get_presence()
            pre_presence = self._sfp_status.get(sfp.sdk_index)
            if presence:
                temperature = sfp.get_temperature()
                if temperature == 0:
                    # A zero temperature is forwarded with zero thresholds and no fault
                    warning_thresh = 0
                    critical_thresh = 0
                    fault = 0
                else:
                    warning_thresh = sfp.get_temperature_warning_threashold()
                    critical_thresh = sfp.get_temperature_critical_threashold()
                    fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
                    temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
                    warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
                    critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)

                hw_management_independent_mode_update.thermal_data_set_module(
                    0, # ASIC index always 0 for now
                    sfp.sdk_index + 1,
                    temperature,
                    critical_thresh,
                    warning_thresh,
                    fault
                )
            else:
                if pre_presence != presence:
                    hw_management_independent_mode_update.thermal_data_clean_module(0, sfp.sdk_index + 1)

            if pre_presence != presence:
                self._sfp_status[sfp.sdk_index] = presence
        except Exception as e:
            # f-string so that the module index and the error actually show up in the log
            logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_module(
                0, # ASIC index always 0 for now
                sfp.sdk_index + 1,
                0,
                0,
                0,
                ERROR_READ_THERMAL_DATA
            )

    def update_module(self):
        """Push the thermal data of every module to hw-management."""
        for sfp in self._sfp_list:
            self.update_single_module(sfp)

    def update_asic(self):
        """Push the ASIC thermal data to hw-management.

        If the temperature cannot be read, the warning threshold is sent in its
        place together with a fault. Any exception is logged and reported to
        hw-management as a fault.
        """
        try:
            asic_temp = self.get_asic_temp()
            warn_threshold = self.get_asic_temp_warning_threashold()
            critical_threshold = self.get_asic_temp_critical_threashold()
            fault = 0
            if asic_temp is None:
                logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
                asic_temp = warn_threshold
                fault = ERROR_READ_THERMAL_DATA

            hw_management_independent_mode_update.thermal_data_set_asic(
                0, # ASIC index always 0 for now
                asic_temp,
                critical_threshold,
                warn_threshold,
                fault
            )
        except Exception as e:
            # f-string so that the error actually shows up in the log
            logger.log_error(f'Failed to update ASIC thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_asic(
                0, # ASIC index always 0 for now
                0,
                0,
                0,
                ERROR_READ_THERMAL_DATA
            )

View File

@ -18,6 +18,7 @@ import ctypes
import functools
import subprocess
import json
import queue
import sys
import threading
import time
@ -289,6 +290,60 @@ def wait_until(predict, timeout, interval=1, *args, **kwargs):
return False
class TimerEvent:
    """A callback together with its interval and repeat flag."""

    def __init__(self, interval, cb, repeat):
        """
        Args:
            interval: interval of the event; added to the current time when the event is (re)scheduled
            cb: callable invoked without arguments by execute()
            repeat: whether the event shall be scheduled again after it was executed
        """
        self.interval = interval
        self._cb = cb
        self.repeat = repeat

    def __lt__(self, other):
        """Order events by interval.

        Events are kept in (timestamp, event) tuples inside a priority queue. When
        two timestamps are equal the tuple comparison falls through to the events,
        which would raise TypeError without an ordering.
        """
        if not isinstance(other, TimerEvent):
            return NotImplemented
        return self.interval < other.interval

    def execute(self):
        """Invoke the callback."""
        self._cb()
class Timer(threading.Thread):
    """Thread that executes timer events at their due time.

    Events are kept as (timestamp, event) tuples in a priority queue; the event
    with the smallest timestamp is handled first.
    """

    def __init__(self):
        super(Timer, self).__init__()
        self._timestamp_queue = queue.PriorityQueue()
        # Set to wake the thread up while it waits for the next due time
        self._wait_event = threading.Event()
        # Set to ask the thread to exit
        self._stop_event = threading.Event()
        # Due time the thread is currently waiting for; None until the first item is taken
        self._min_timestamp = None

    def schedule(self, interval, cb, repeat=True, run_now=True):
        """Create a timer event for the callback and add it to the queue.

        Args:
            interval: interval of the event, added to time.time() values
            cb: callable invoked without arguments
            repeat: True to schedule the event again after each execution
            run_now: True to make the event due immediately, False to make it due after one interval
        """
        timer_event = TimerEvent(interval, cb, repeat)
        self.add_timer_event(timer_event, run_now)

    def add_timer_event(self, timer_event, run_now=True):
        """Add an existing timer event to the queue.

        Args:
            timer_event: object providing interval, repeat and execute()
            run_now: True to make the event due immediately, False to make it due after one interval
        """
        timestamp = time.time()
        if not run_now:
            timestamp += timer_event.interval

        self._timestamp_queue.put_nowait((timestamp, timer_event))
        # Wake the thread up if the new event is due earlier than the one it waits for
        if self._min_timestamp is not None and timestamp < self._min_timestamp:
            self._wait_event.set()

    def stop(self):
        """Ask the thread to exit and wait for it. Does nothing if the thread is not running."""
        if self.is_alive():
            # Set the stop flag first so that a thread woken up by the wait event
            # always sees it and does not start waiting again
            self._stop_event.set()
            self._wait_event.set()
            # The thread may be blocked in get() on an empty queue; a wake-up
            # item that sorts before every real event unblocks it
            self._timestamp_queue.put_nowait((float('-inf'), None))
            self.join()

    def run(self):
        while not self._stop_event.is_set():
            item = self._timestamp_queue.get()
            if self._stop_event.is_set():
                # Either the wake-up item of stop() or stop() was called while blocked
                break

            # get() may have blocked for a while, so take the time afterwards
            now = time.time()
            self._min_timestamp = item[0]
            if self._min_timestamp > now:
                # Not due yet: wait, then put it back and re-evaluate the queue
                # since an earlier event may have been added in the meantime
                self._wait_event.wait(self._min_timestamp - now)
                self._wait_event.clear()
                self._timestamp_queue.put(item)
                continue

            timer_event = item[1]
            timer_event.execute()
            if timer_event.repeat:
                self.add_timer_event(timer_event, False)
class DbUtils:
lock = threading.Lock()
db_instances = threading.local()

View File

@ -292,6 +292,46 @@ class TestSfp:
assert sfp.get_transceiver_threshold_info()
sfp.reinit()
@mock.patch('os.path.exists')
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_temperature(self, mock_read, mock_exists):
    """Check SFP.get_temperature in software control mode and with mocked sysfs reads."""
    sfp = SFP(0)
    # Software control: the value comes from the parent class implementation
    sfp.is_sw_control = mock.MagicMock(return_value=True)
    mock_exists.return_value = False
    assert sfp.get_temperature() is None

    mock_exists.return_value = True
    assert sfp.get_temperature() is None

    # Not under software control: the value comes from the mocked sysfs read
    mock_read.return_value = None
    sfp.is_sw_control.return_value = False
    assert sfp.get_temperature() is None

    # Raw value 448 divided by the scale of 8.0
    mock_read.return_value = 448
    assert sfp.get_temperature() == 56.0
def test_get_temperature_threshold(self):
    """Check the SFP temperature thresholds: defaults first, then values read through a mocked xcvr API."""
    module = SFP(0)
    module.is_sw_control = mock.MagicMock(return_value=True)
    # The defaults are expected at this point
    assert module.get_temperature_warning_threashold() == 70.0
    assert module.get_temperature_critical_threashold() == 80.0

    # API available but thresholds not supported: still the defaults
    fake_api = mock.MagicMock()
    fake_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
    module.get_xcvr_api = mock.MagicMock(return_value=fake_api)
    assert module.get_temperature_warning_threashold() == 70.0
    assert module.get_temperature_critical_threashold() == 80.0

    # Thresholds supported: the values read from the mocked EEPROM are returned
    from sonic_platform_base.sonic_xcvr.fields import consts
    fake_api.get_transceiver_thresholds_support.return_value = True
    fake_api.xcvr_eeprom = mock.MagicMock()
    eeprom_thresholds = {
        consts.TEMP_HIGH_ALARM_FIELD: 85.0,
        consts.TEMP_HIGH_WARNING_FIELD: 75.0
    }
    fake_api.xcvr_eeprom.read = mock.MagicMock(return_value=eeprom_thresholds)
    assert module.get_temperature_warning_threashold() == 75.0
    assert module.get_temperature_critical_threashold() == 85.0
@mock.patch('sonic_platform.utils.read_int_from_file')
@mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode')
@mock.patch('sonic_platform.utils.DbUtils.get_db_instance')

View File

@ -31,6 +31,7 @@ sys.path.insert(0, modules_path)
import sonic_platform.chassis
from sonic_platform.chassis import Chassis
from sonic_platform.device_data import DeviceDataManager
from sonic_platform.sfp import SFP
sonic_platform.chassis.extract_RJ45_ports_index = mock.MagicMock(return_value=[])
@ -148,23 +149,27 @@ class TestThermal:
@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
def test_sfp_thermal(self):
from sonic_platform.thermal import initialize_sfp_thermal, THERMAL_NAMING_RULE
thermal_list = initialize_sfp_thermal(0)
from sonic_platform.thermal import THERMAL_NAMING_RULE
sfp = SFP(0)
thermal_list = sfp.get_all_thermals()
assert len(thermal_list) == 1
thermal = thermal_list[0]
rule = THERMAL_NAMING_RULE['sfp thermals']
start_index = rule.get('start_index', 1)
assert thermal.get_name() == rule['name'].format(start_index)
assert rule['temperature'].format(start_index) in thermal.temperature
assert rule['high_threshold'].format(start_index) in thermal.high_threshold
assert rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False
sfp.get_temperature = mock.MagicMock(return_value=35.4)
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
assert thermal.get_temperature() == 35.4
assert thermal.get_high_threshold() == 70
assert thermal.get_high_critical_threshold() == 80
@mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_temperature(self, mock_read):
from sonic_platform.thermal import Thermal
thermal = Thermal('test', 'temp_file', None, None, 1)
thermal = Thermal('test', 'temp_file', None, None, None, None, 1000, 1)
mock_read.return_value = 35727
assert thermal.get_temperature() == 35.727
@ -177,7 +182,7 @@ class TestThermal:
@mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_high_threshold(self, mock_read):
from sonic_platform.thermal import Thermal
thermal = Thermal('test', None, None, None, 1)
thermal = Thermal('test', None, None, None, None, None, 1000, 1)
assert thermal.get_high_threshold() is None
thermal.high_threshold = 'high_th_file'
@ -193,7 +198,7 @@ class TestThermal:
@mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_high_critical_threshold(self, mock_read):
from sonic_platform.thermal import Thermal
thermal = Thermal('test', None, None, None, 1)
thermal = Thermal('test', None, None, None, None, None, 1000, 1)
assert thermal.get_high_critical_threshold() is None
thermal.high_critical_threshold = 'high_th_file'

View File

@ -0,0 +1,128 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
from unittest import mock
from sonic_platform import utils
from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update
from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
mock_tc_config = """
{
"dev_parameters": {
"asic": {
"pwm_min": 20,
"pwm_max": 100,
"val_min": "!70000",
"val_max": "!105000",
"poll_time": 3
},
"module\\\\d+": {
"pwm_min": 20,
"pwm_max": 100,
"val_min": 60000,
"val_max": 80000,
"poll_time": 20
}
}
}
"""
class TestThermalUpdater:
    """Unit tests for sonic_platform.thermal_updater.ThermalUpdater"""

    def test_load_tc_config_non_exists(self):
        """Both periodic updates are scheduled when the config file is not mocked."""
        thermal_updater = ThermalUpdater(None)
        thermal_updater.load_tc_config()
        assert thermal_updater._timer._timestamp_queue.qsize() == 2

    def test_load_tc_config_mocked(self):
        """Both periodic updates are scheduled when the config file content is mocked."""
        thermal_updater = ThermalUpdater(None)
        fake_open = mock.mock_open(read_data=mock_tc_config)
        with mock.patch('sonic_platform.utils.open', fake_open):
            thermal_updater.load_tc_config()
        assert thermal_updater._timer._timestamp_queue.qsize() == 2

    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_asic', mock.MagicMock())
    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_module', mock.MagicMock())
    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.wait_all_sfp_ready')
    @mock.patch('sonic_platform.utils.write_file')
    def test_start_stop(self, mock_write, mock_wait):
        """start() resumes hw-management-tc when the SFPs are ready and suspends it otherwise; stop() suspends it."""
        mock_wait.return_value = True
        fake_sfp = mock.MagicMock()
        fake_sfp.sdk_index = 1
        thermal_updater = ThermalUpdater([fake_sfp])
        thermal_updater.start()
        mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0)
        utils.wait_until(thermal_updater._timer.is_alive, timeout=5)

        mock_write.reset_mock()
        thermal_updater.stop()
        assert not thermal_updater._timer.is_alive()
        mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1)

        # SFPs never become ready: hw-management-tc is suspended
        mock_wait.return_value = False
        mock_write.reset_mock()
        thermal_updater.start()
        mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1)
        thermal_updater.stop()

    @mock.patch('sonic_platform.thermal_updater.time.sleep', mock.MagicMock())
    def test_wait_all_sfp_ready(self):
        """Waiting succeeds when is_sw_control() works and fails when it keeps raising."""
        fake_sfp = mock.MagicMock()
        fake_sfp.is_sw_control = mock.MagicMock(return_value=True)
        thermal_updater = ThermalUpdater([fake_sfp])
        assert thermal_updater.wait_all_sfp_ready()

        fake_sfp.is_sw_control.side_effect = Exception('')
        assert not thermal_updater.wait_all_sfp_ready()

    @mock.patch('sonic_platform.utils.read_int_from_file')
    def test_update_asic(self, mock_read):
        """ASIC readings are scaled, and the thresholds fall back to the defaults when reading fails."""
        mock_read.return_value = 8
        thermal_updater = ThermalUpdater(None)
        assert thermal_updater.get_asic_temp() == 1000
        assert thermal_updater.get_asic_temp_warning_threashold() == 1000
        assert thermal_updater.get_asic_temp_critical_threashold() == 1000
        thermal_updater.update_asic()
        hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()

        mock_read.return_value = None
        assert thermal_updater.get_asic_temp() is None
        assert thermal_updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
        assert thermal_updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def test_update_module(self):
        """Module data is set for a present module and cleaned once the module is absent."""
        fake_sfp = mock.MagicMock()
        fake_sfp.sdk_index = 10
        fake_sfp.get_presence = mock.MagicMock(return_value=True)
        fake_sfp.get_temperature = mock.MagicMock(return_value=55.0)
        fake_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
        fake_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
        thermal_updater = ThermalUpdater([fake_sfp])
        thermal_updater.update_module()
        hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)

        # A zero temperature is forwarded with zero thresholds and no fault
        fake_sfp.get_temperature = mock.MagicMock(return_value=0.0)
        hw_management_independent_mode_update.reset_mock()
        thermal_updater.update_module()
        hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0)

        # The module disappears: its data is cleaned
        fake_sfp.get_presence = mock.MagicMock(return_value=False)
        thermal_updater.update_module()
        hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11)

View File

@ -191,6 +191,26 @@ class TestUtils:
mock_os_open = mock.mock_open(read_data='a:b')
with mock.patch('sonic_platform.utils.open', mock_os_open):
assert utils.read_key_value_file('some_file') == {'a':'b'}
mock_os_open = mock.mock_open(read_data='a=b')
with mock.patch('sonic_platform.utils.open', mock_os_open):
assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'}
def test_timer(self):
    """Check immediate, future, one-shot and repeating events of utils.Timer."""
    timer = utils.Timer()
    timer.start()

    cb_long_now = mock.MagicMock()
    cb_long_future = mock.MagicMock()
    cb_short_once = mock.MagicMock()
    cb_short_repeat = mock.MagicMock()
    timer.schedule(1000, cb=cb_long_now, repeat=False, run_now=True)
    timer.schedule(1000, cb=cb_long_future, repeat=False, run_now=False)
    timer.schedule(1, cb=cb_short_once, repeat=False, run_now=False)
    timer.schedule(1, cb=cb_short_repeat, repeat=True, run_now=False)

    # Long enough for the 1 second events to become due more than once
    time.sleep(3)
    timer.stop()

    cb_long_now.assert_called_once()
    cb_long_future.assert_not_called()
    cb_short_once.assert_called_once()
    assert cb_short_repeat.call_count > 1