[Mellanox] update asic and module temperature in a thread for CMIS management (#16955)

- Why I did it
When a module is fully under software control, the driver cannot get the module temperature or temperature thresholds from firmware. In this case, SONiC needs to read the temperature and temperature thresholds from the module EEPROM. This PR adds a thermal updater thread that periodically updates the module temperature and temperature thresholds while software control is enabled.

- How I did it
Query ASIC temperature from SDK sysfs and update hw-management-tc periodically
Query Module temperature from EEPROM and update hw-management-tc periodically

- How to verify it
Manual test
New Unit tests
This commit is contained in:
Junchao-Mellanox 2023-12-13 20:19:44 +08:00 committed by GitHub
parent 0d62cf0e92
commit 1b84f3daa5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 675 additions and 26 deletions

View File

@ -82,6 +82,8 @@ class Chassis(ChassisBase):
# System UID LED # System UID LED
_led_uid = None _led_uid = None
chassis_instance = None
def __init__(self): def __init__(self):
super(Chassis, self).__init__() super(Chassis, self).__init__()
@ -127,6 +129,8 @@ class Chassis(ChassisBase):
self._RJ45_port_inited = False self._RJ45_port_inited = False
self._RJ45_port_list = None self._RJ45_port_list = None
Chassis.chassis_instance = self
self.modules_mgmt_thread = threading.Thread() self.modules_mgmt_thread = threading.Thread()
self.modules_changes_queue = queue.Queue() self.modules_changes_queue = queue.Queue()
self.modules_mgmt_task_stopping_event = threading.Event() self.modules_mgmt_task_stopping_event = threading.Event()

View File

@ -31,6 +31,8 @@ try:
from . import utils from . import utils
from .device_data import DeviceDataManager from .device_data import DeviceDataManager
from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
from sonic_platform_base.sonic_xcvr.fields import consts
from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436
except ImportError as e: except ImportError as e:
raise ImportError (str(e) + "- required module not found") raise ImportError (str(e) + "- required module not found")
@ -155,6 +157,10 @@ SFP_TYPE_SFF8636 = 'sff8636'
# SFP stderr # SFP stderr
SFP_EEPROM_NOT_AVAILABLE = 'Input/output error' SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'
SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
SFP_TEMPERATURE_SCALE = 8.0
# SFP EEPROM limited bytes # SFP EEPROM limited bytes
limited_eeprom = { limited_eeprom = {
SFP_TYPE_CMIS: { SFP_TYPE_CMIS: {
@ -264,7 +270,7 @@ class SFP(NvidiaSFPCommon):
if slot_id == 0: # For non-modular chassis if slot_id == 0: # For non-modular chassis
from .thermal import initialize_sfp_thermal from .thermal import initialize_sfp_thermal
self._thermal_list = initialize_sfp_thermal(sfp_index) self._thermal_list = initialize_sfp_thermal(self)
else: # For modular chassis else: # For modular chassis
# (slot_id % MAX_LC_CONUNT - 1) * MAX_PORT_COUNT + (sfp_index + 1) * (MAX_PORT_COUNT / LC_PORT_COUNT) # (slot_id % MAX_LC_CONUNT - 1) * MAX_PORT_COUNT + (sfp_index + 1) * (MAX_PORT_COUNT / LC_PORT_COUNT)
max_linecard_count = DeviceDataManager.get_linecard_count() max_linecard_count = DeviceDataManager.get_linecard_count()
@ -822,6 +828,77 @@ class SFP(NvidiaSFPCommon):
api = self.get_xcvr_api() api = self.get_xcvr_api()
return [False] * api.NUM_CHANNELS if api else None return [False] * api.NUM_CHANNELS if api else None
def get_temperature(self):
    """Get the module temperature.

    When the module is not under software control, the raw value is read
    from the SDK sysfs node of this module and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the SFP state is refreshed via
    reinit() and the value is taken from the base class implementation.

    Returns:
        The temperature, None if it is not available (sysfs node missing or
        unreadable, or the base class returns None), or 0.0 if an error is
        raised while probing the control mode or reading sysfs.
    """
    try:
        if not self.is_sw_control():
            temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
            if not os.path.exists(temp_file):
                logger.log_error(f'Failed to read from file {temp_file} - not exists')
                return None
            temperature = utils.read_int_from_file(temp_file,
                                                   log_func=None)
            return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else None
    except Exception:
        # Best effort: report 0.0 instead of propagating the error. Only
        # Exception is caught so that KeyboardInterrupt/SystemExit still
        # propagate.
        return 0.0

    # Software control: refresh the SFP state, then read through the base class.
    self.reinit()
    return super().get_temperature()
def get_temperature_warning_threashold(self):
    """Get temperature warning threshold

    When the module is not under software control, the threshold is read
    from the SDK sysfs "emergency" node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise it is taken from the thresholds
    returned by _get_temperature_threshold().

    Returns:
        float: temperature warning threshold;
        SFP_DEFAULT_TEMP_WARNNING_THRESHOLD when it cannot be obtained
    """
    try:
        if not self.is_sw_control():
            emergency = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
                log_func=None,
                default=None)
            return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
    except Exception:
        # Best effort: fall back to the default instead of propagating.
        # Only Exception is caught so KeyboardInterrupt/SystemExit propagate.
        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_WARNING_FIELD]
    return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
def get_temperature_critical_threashold(self):
    """Get temperature critical threshold

    When the module is not under software control, the threshold is read
    from the SDK sysfs "critical" node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise it is taken from the thresholds
    returned by _get_temperature_threshold().

    Returns:
        float: temperature critical threshold;
        SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD when it cannot be obtained
    """
    try:
        if not self.is_sw_control():
            critical = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
                log_func=None,
                default=None)
            return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
    except Exception:
        # Best effort: fall back to the default instead of propagating.
        # Only Exception is caught so KeyboardInterrupt/SystemExit propagate.
        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_ALARM_FIELD]
    return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
def _get_temperature_threshold(self):
    """Read the threshold fields of this module through its xcvr API.

    Returns:
        Whatever the xcvr EEPROM read returns for the threshold field, or
        None when no xcvr API is available or the API reports that
        thresholds are not supported.
    """
    self.reinit()
    xcvr_api = self.get_xcvr_api()
    if not xcvr_api:
        return None

    if not xcvr_api.get_transceiver_thresholds_support():
        return None

    # SFF-8636 / SFF-8436 APIs are read through TEMP_THRESHOLDS_FIELD,
    # every other API through THRESHOLDS_FIELD.
    temp_only_api_types = (sff8636.Sff8636Api, sff8436.Sff8436Api)
    if isinstance(xcvr_api, temp_only_api_types):
        return xcvr_api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
    return xcvr_api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
def get_xcvr_api(self): def get_xcvr_api(self):
""" """
Retrieves the XcvrApi associated with this SFP Retrieves the XcvrApi associated with this SFP

View File

@ -36,6 +36,8 @@ except ImportError as e:
# Global logger class instance # Global logger class instance
logger = Logger() logger = Logger()
DEFAULT_TEMP_SCALE = 1000
""" """
The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and
high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types
@ -72,9 +74,11 @@ THERMAL_NAMING_RULE = {
"chassis thermals": [ "chassis thermals": [
{ {
"name": "ASIC", "name": "ASIC",
"temperature": "asic", "temperature": "input",
"high_threshold": "asic_temp_emergency", "high_threshold_default": 105,
"high_critical_threshold": "asic_temp_trip_crit" "high_critical_threshold_default": 120,
"sysfs_folder": "/sys/module/sx_core/asic0/temperature",
"scale": 8
}, },
{ {
"name": "Ambient Port Side Temp", "name": "Ambient Port Side Temp",
@ -187,8 +191,8 @@ def initialize_psu_thermal(psu_index, presence_cb):
return [create_indexable_thermal(THERMAL_NAMING_RULE['psu thermals'], psu_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1, presence_cb)] return [create_indexable_thermal(THERMAL_NAMING_RULE['psu thermals'], psu_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1, presence_cb)]
def initialize_sfp_thermal(sfp_index): def initialize_sfp_thermal(sfp):
return [create_indexable_thermal(THERMAL_NAMING_RULE['sfp thermals'], sfp_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1)] return [ModuleThermal(sfp)]
def initialize_linecard_thermals(lc_name, lc_index): def initialize_linecard_thermals(lc_name, lc_index):
@ -214,6 +218,7 @@ def initialize_linecard_sfp_thermal(lc_name, lc_index, sfp_index):
def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=None): def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=None):
index += rule.get('start_index', 1) index += rule.get('start_index', 1)
name = rule['name'].format(index) name = rule['name'].format(index)
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
temp_file = os.path.join(sysfs_folder, rule['temperature'].format(index)) temp_file = os.path.join(sysfs_folder, rule['temperature'].format(index))
_check_thermal_sysfs_existence(temp_file) _check_thermal_sysfs_existence(temp_file)
if 'high_threshold' in rule: if 'high_threshold' in rule:
@ -226,10 +231,13 @@ def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=No
_check_thermal_sysfs_existence(high_crit_th_file) _check_thermal_sysfs_existence(high_crit_th_file)
else: else:
high_crit_th_file = None high_crit_th_file = None
high_th_default = rule.get('high_threshold_default')
high_crit_th_default = rule.get('high_critical_threshold_default')
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
if not presence_cb: if not presence_cb:
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position) return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
else: else:
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb) return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)
def create_single_thermal(rule, sysfs_folder, position, presence_cb=None): def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
@ -243,6 +251,7 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
elif not default_present: elif not default_present:
return None return None
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
temp_file = os.path.join(sysfs_folder, temp_file) temp_file = os.path.join(sysfs_folder, temp_file)
_check_thermal_sysfs_existence(temp_file) _check_thermal_sysfs_existence(temp_file)
if 'high_threshold' in rule: if 'high_threshold' in rule:
@ -255,11 +264,14 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
_check_thermal_sysfs_existence(high_crit_th_file) _check_thermal_sysfs_existence(high_crit_th_file)
else: else:
high_crit_th_file = None high_crit_th_file = None
high_th_default = rule.get('high_threshold_default')
high_crit_th_default = rule.get('high_critical_threshold_default')
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
name = rule['name'] name = rule['name']
if not presence_cb: if not presence_cb:
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position) return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
else: else:
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb) return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)
def _check_thermal_sysfs_existence(file_path): def _check_thermal_sysfs_existence(file_path):
@ -268,7 +280,7 @@ def _check_thermal_sysfs_existence(file_path):
class Thermal(ThermalBase): class Thermal(ThermalBase):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position): def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position):
""" """
index should be a string for category ambient and int for other categories index should be a string for category ambient and int for other categories
""" """
@ -278,6 +290,9 @@ class Thermal(ThermalBase):
self.temperature = temp_file self.temperature = temp_file
self.high_threshold = high_th_file self.high_threshold = high_th_file
self.high_critical_threshold = high_crit_th_file self.high_critical_threshold = high_crit_th_file
self.high_th_default = high_th_default
self.high_crit_th_default = high_crit_th_default
self.scale = scale
def get_name(self): def get_name(self):
""" """
@ -297,7 +312,7 @@ class Thermal(ThermalBase):
of one degree Celsius, e.g. 30.125 of one degree Celsius, e.g. 30.125
""" """
value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info) value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None return value / self.scale if (value is not None and value != 0) else None
def get_high_threshold(self): def get_high_threshold(self):
""" """
@ -308,9 +323,9 @@ class Thermal(ThermalBase):
up to nearest thousandth of one degree Celsius, e.g. 30.125 up to nearest thousandth of one degree Celsius, e.g. 30.125
""" """
if self.high_threshold is None: if self.high_threshold is None:
return None return self.high_th_default
value = utils.read_float_from_file(self.high_threshold, None, log_func=logger.log_info) value = utils.read_float_from_file(self.high_threshold, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None return value / self.scale if (value is not None and value != 0) else self.high_th_default
def get_high_critical_threshold(self): def get_high_critical_threshold(self):
""" """
@ -321,9 +336,9 @@ class Thermal(ThermalBase):
up to nearest thousandth of one degree Celsius, e.g. 30.125 up to nearest thousandth of one degree Celsius, e.g. 30.125
""" """
if self.high_critical_threshold is None: if self.high_critical_threshold is None:
return None return self.high_crit_th_default
value = utils.read_float_from_file(self.high_critical_threshold, None, log_func=logger.log_info) value = utils.read_float_from_file(self.high_critical_threshold, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None return value / self.scale if (value is not None and value != 0) else self.high_crit_th_default
def get_position_in_parent(self): def get_position_in_parent(self):
""" """
@ -343,8 +358,8 @@ class Thermal(ThermalBase):
class RemovableThermal(Thermal): class RemovableThermal(Thermal):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb): def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb):
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position) super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
self.presence_cb = presence_cb self.presence_cb = presence_cb
def get_temperature(self): def get_temperature(self):
@ -388,3 +403,68 @@ class RemovableThermal(Thermal):
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None return None
return super(RemovableThermal, self).get_high_critical_threshold() return super(RemovableThermal, self).get_high_critical_threshold()
class ModuleThermal(ThermalBase):
    """Thermal object of a transceiver module.

    Every reading is delegated to the SFP object passed to the constructor.
    """

    def __init__(self, sfp):
        """
        Args:
            sfp: SFP object backing this thermal. It must provide sdk_index,
                get_temperature(), get_temperature_warning_threashold() and
                get_temperature_critical_threashold().
        """
        super().__init__()
        self.sfp = sfp
        # The displayed module number is the SDK index plus one.
        self.name = f'xSFP module {sfp.sdk_index + 1} Temp'

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name of the device
        """
        return self.name

    def get_temperature(self):
        """
        Retrieves current temperature reading from thermal

        Returns:
            The value returned by get_temperature() of the SFP object
        """
        module = self.sfp
        return module.get_temperature()

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature of thermal

        Returns:
            The temperature warning threshold reported by the SFP object
        """
        module = self.sfp
        return module.get_temperature_warning_threashold()

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature of thermal

        Returns:
            The temperature critical threshold reported by the SFP object
        """
        module = self.sfp
        return module.get_temperature_critical_threashold()

    def get_position_in_parent(self):
        """
        Retrieves 1-based relative physical position in parent device

        Returns:
            integer: always 1 for a module thermal
        """
        return 1

    def is_replaceable(self):
        """
        Indicate whether this device is replaceable.

        Returns:
            bool: always False for a module thermal
        """
        return False

View File

@ -15,9 +15,36 @@
# limitations under the License. # limitations under the License.
# #
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from . import thermal_updater
from .device_data import DeviceDataManager
class ThermalManager(ThermalManagerBase): class ThermalManager(ThermalManagerBase):
thermal_updater_task = None
@classmethod @classmethod
def run_policy(cls, chassis): def run_policy(cls, chassis):
pass pass
@classmethod
def initialize(cls):
    """
    Initialize thermal manager.

    In independent mode, create a ThermalUpdater for all SFPs of the chassis
    instance and start it. In any other mode this is a no-op.
    :return:
    """
    if not DeviceDataManager.is_independent_mode():
        return

    # Chassis is imported here rather than at module level
    # (presumably to avoid a circular import - confirm).
    from .chassis import Chassis
    sfp_list = Chassis.chassis_instance.get_all_sfps()
    cls.thermal_updater_task = thermal_updater.ThermalUpdater(sfp_list)
    cls.thermal_updater_task.start()
@classmethod
def deinitialize(cls):
    """
    Destroy thermal manager: stop the thermal updater task if one was created.

    The task only exists after initialize() has run in independent mode, so
    calling this without a prior initialize() is a no-op instead of an
    AttributeError on None.
    :return:
    """
    if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
        cls.thermal_updater_task.stop()

View File

@ -0,0 +1,213 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from . import utils
from sonic_py_common import logger
import sys
import time
sys.path.append('/run/hw-management/bin')
try:
import hw_management_independent_mode_update
except ImportError:
# For unit test only
from unittest import mock
hw_management_independent_mode_update = mock.MagicMock()
hw_management_independent_mode_update.module_data_set_module_counter = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_module = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock()
# Multiplier applied to module temperature/thresholds returned by the SFP object
# before they are passed to hw-management (presumably Celsius -> millidegree - confirm).
SFP_TEMPERATURE_SCALE = 1000
# Multiplier applied to the raw integer read from the ASIC sysfs temperature nodes
# (presumably 1/8 degree units -> millidegree - confirm).
ASIC_TEMPERATURE_SCALE = 125
# ASIC thresholds used when the corresponding sysfs node cannot be read.
ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000
# Value passed as the "fault" argument to hw-management when thermal data cannot be read.
ERROR_READ_THERMAL_DATA = 254000
# JSON file from which the ASIC/module polling intervals are loaded.
TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json'
# NOTE: rebinds the name "logger" from the imported module to a Logger instance.
logger = logger.Logger('thermal-updater')
class ThermalUpdater:
    """Periodically push ASIC and module thermal data to hw-management.

    ASIC data is read from SDK sysfs nodes, module data from the SFP objects
    given to the constructor. Both are forwarded through the
    hw_management_independent_mode_update module on a utils.Timer.
    """

    def __init__(self, sfp_list):
        """
        Args:
            sfp_list: SFP objects to monitor. Each must provide sdk_index,
                get_presence(), is_sw_control() and the temperature getters.
        """
        self._sfp_list = sfp_list
        # sdk_index -> presence observed on the previous update
        self._sfp_status = {}
        self._timer = utils.Timer()

    def load_tc_config(self):
        """Load polling intervals from TC_CONFIG_FILE and schedule both updates.

        The intervals default to 1 (ASIC) and 10 (module). When the config
        file provides a 'poll_time', half of that value is used.
        """
        asic_poll_interval = 1
        sfp_poll_interval = 10
        data = utils.load_json_file(TC_CONFIG_FILE)
        if not data:
            logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
        else:
            dev_parameters = data.get('dev_parameters')
            if dev_parameters is not None:
                asic_parameter = dev_parameters.get('asic')
                if asic_parameter is not None:
                    asic_poll_interval_config = asic_parameter.get('poll_time')
                    if asic_poll_interval_config:
                        asic_poll_interval = int(asic_poll_interval_config) / 2
                # The config key is the literal text 'module\d+', not a regex match.
                module_parameter = dev_parameters.get('module\\d+')
                if module_parameter is not None:
                    sfp_poll_interval_config = module_parameter.get('poll_time')
                    if sfp_poll_interval_config:
                        sfp_poll_interval = int(sfp_poll_interval_config) / 2

        logger.log_notice(f'ASIC polling interval: {asic_poll_interval}')
        self._timer.schedule(asic_poll_interval, self.update_asic)
        logger.log_notice(f'Module polling interval: {sfp_poll_interval}')
        self._timer.schedule(sfp_poll_interval, self.update_module)

    def start(self):
        """Clean stale data, wait for the SFPs and start the periodic updates.

        If the SFPs do not become ready in time, hw-management-tc is
        suspended and no update is scheduled.
        """
        self.clean_thermal_data()
        if not self.wait_all_sfp_ready():
            logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend')
            self.control_tc(True)
            return
        self.control_tc(False)
        self.load_tc_config()
        self._timer.start()

    def stop(self):
        """Stop the periodic updates and suspend hw-management-tc."""
        self._timer.stop()
        self.control_tc(True)

    def control_tc(self, suspend):
        """Suspend (True) or resume (False) hw-management-tc via its suspend file."""
        logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}')
        utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0)

    def clean_thermal_data(self):
        """Publish the module count and clean ASIC and per-module thermal data."""
        hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list))
        hw_management_independent_mode_update.thermal_data_clean_asic(0)
        for sfp in self._sfp_list:
            hw_management_independent_mode_update.thermal_data_clean_module(
                0,
                sfp.sdk_index + 1
            )

    def wait_all_sfp_ready(self):
        """Wait until is_sw_control() succeeds for every SFP.

        Returns:
            bool: True once all SFPs are ready, False if they are still not
            all ready after 60 polling rounds (one second apart).
        """
        logger.log_notice('Waiting for all SFP modules ready...')
        max_wait_time = 60
        ready_set = set()
        while len(ready_set) != len(self._sfp_list):
            for sfp in self._sfp_list:
                try:
                    sfp.is_sw_control()
                    ready_set.add(sfp)
                except Exception:
                    # Not ready yet; probe it again in the next round.
                    continue
            if len(ready_set) == len(self._sfp_list):
                # Everything became ready in this round: do not burn another
                # second (or report a timeout) for nothing.
                break
            max_wait_time -= 1
            if max_wait_time == 0:
                return False
            time.sleep(1)

        logger.log_notice('All SFP modules are ready')
        return True

    def get_asic_temp(self):
        """Return the scaled ASIC temperature, or None if it cannot be read."""
        temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
        return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None

    def get_asic_temp_warning_threashold(self):
        """Return the scaled ASIC warning threshold, or the default if unreadable."""
        emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
        return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD

    def get_asic_temp_critical_threashold(self):
        """Return the scaled ASIC critical threshold, or the default if unreadable."""
        critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
        return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def update_single_module(self, sfp):
        """Push the thermal data of one module to hw-management.

        A present module gets its temperature and thresholds published; a
        module that just disappeared gets its data cleaned. Any error is
        logged and reported as a read fault with zeroed values.
        """
        try:
            presence = sfp.get_presence()
            pre_presence = self._sfp_status.get(sfp.sdk_index)
            if presence:
                temperature = sfp.get_temperature()
                if temperature == 0:
                    # A zero temperature is published with zero thresholds
                    # and without raising a fault.
                    warning_thresh = 0
                    critical_thresh = 0
                    fault = 0
                else:
                    warning_thresh = sfp.get_temperature_warning_threashold()
                    critical_thresh = sfp.get_temperature_critical_threashold()
                    fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
                    temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
                    warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
                    critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)

                hw_management_independent_mode_update.thermal_data_set_module(
                    0, # ASIC index always 0 for now
                    sfp.sdk_index + 1,
                    temperature,
                    critical_thresh,
                    warning_thresh,
                    fault
                )
            else:
                if pre_presence != presence:
                    hw_management_independent_mode_update.thermal_data_clean_module(0, sfp.sdk_index + 1)

            if pre_presence != presence:
                self._sfp_status[sfp.sdk_index] = presence
        except Exception as e:
            # The f prefix is required, otherwise the placeholders are logged literally.
            logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_module(
                0, # ASIC index always 0 for now
                sfp.sdk_index + 1,
                0,
                0,
                0,
                ERROR_READ_THERMAL_DATA
            )

    def update_module(self):
        """Push the thermal data of every monitored module."""
        for sfp in self._sfp_list:
            self.update_single_module(sfp)

    def update_asic(self):
        """Push ASIC temperature and thresholds to hw-management.

        If the temperature cannot be read, the warning threshold is sent in
        its place together with a read fault. Any error is logged and
        reported as a read fault with zeroed values.
        """
        try:
            asic_temp = self.get_asic_temp()
            warn_threshold = self.get_asic_temp_warning_threashold()
            critical_threshold = self.get_asic_temp_critical_threashold()
            fault = 0
            if asic_temp is None:
                logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
                asic_temp = warn_threshold
                fault = ERROR_READ_THERMAL_DATA

            hw_management_independent_mode_update.thermal_data_set_asic(
                0, # ASIC index always 0 for now
                asic_temp,
                critical_threshold,
                warn_threshold,
                fault
            )
        except Exception as e:
            # The f prefix is required, otherwise '{e}' is logged literally.
            logger.log_error(f'Failed to update ASIC thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_asic(
                0, # ASIC index always 0 for now
                0,
                0,
                0,
                ERROR_READ_THERMAL_DATA
            )

View File

@ -18,6 +18,7 @@ import ctypes
import functools import functools
import subprocess import subprocess
import json import json
import queue
import sys import sys
import threading import threading
import time import time
@ -289,6 +290,60 @@ def wait_until(predict, timeout, interval=1, *args, **kwargs):
return False return False
class TimerEvent:
    """A callback together with its scheduling parameters."""

    def __init__(self, interval, cb, repeat):
        """
        Args:
            interval: delay in seconds between two executions
            cb: callable invoked without arguments
            repeat: whether the event is re-armed after each execution
        """
        self.interval = interval
        self._cb = cb
        self.repeat = repeat

    def execute(self):
        """Invoke the callback."""
        self._cb()


class Timer(threading.Thread):
    """Thread that executes TimerEvent callbacks when they become due.

    Queue entries are (timestamp, sequence, timer_event). The sequence number
    breaks ties between equal timestamps so the priority queue never has to
    compare TimerEvent objects, which define no ordering.
    """

    def __init__(self):
        super(Timer, self).__init__()
        self._timestamp_queue = queue.PriorityQueue()
        self._wait_event = threading.Event()
        self._stop_event = threading.Event()
        self._min_timestamp = None
        # Monotonic tie-breaker for queue entries; events can be added from
        # the timer thread and from other threads, hence the lock.
        self._sequence = 0
        self._sequence_lock = threading.Lock()

    def schedule(self, interval, cb, repeat=True, run_now=True):
        """Schedule a callback.

        Args:
            interval: delay in seconds between two executions
            cb: callable invoked without arguments
            repeat: re-arm the event after each execution
            run_now: make the first execution due immediately instead of
                after one interval
        """
        timer_event = TimerEvent(interval, cb, repeat)
        self.add_timer_event(timer_event, run_now)

    def _enqueue(self, timestamp, timer_event):
        """Queue an entry with a unique sequence number as tie-breaker."""
        with self._sequence_lock:
            sequence = self._sequence
            self._sequence += 1
        self._timestamp_queue.put_nowait((timestamp, sequence, timer_event))

    def add_timer_event(self, timer_event, run_now=True):
        """Queue a TimerEvent, due now or after one interval."""
        timestamp = time.time()
        if not run_now:
            timestamp += timer_event.interval

        self._enqueue(timestamp, timer_event)
        # Wake the worker if this event is due before the one it waits for.
        if self._min_timestamp is not None and timestamp < self._min_timestamp:
            self._wait_event.set()

    def stop(self):
        """Stop the timer thread and wait for it to exit.

        Does nothing if the thread is not running.
        """
        if self.is_alive():
            # Set the stop flag before waking the worker, so that a worker
            # woken from its wait is guaranteed to observe it.
            self._stop_event.set()
            self._wait_event.set()
            # Sentinel entry: unblocks a worker that is blocked in get() on
            # an empty queue. Timestamp 0 sorts before any real event.
            self._enqueue(0, None)
            self.join()

    def run(self):
        """Worker loop: wait for the earliest event and execute it when due."""
        while not self._stop_event.is_set():
            item = self._timestamp_queue.get()
            timestamp, _, timer_event = item
            if timer_event is None:
                # Wake-up sentinel from stop(); the loop condition decides.
                continue

            # Sample the clock after get(), which may have blocked for long.
            now = time.time()
            self._min_timestamp = timestamp
            if timestamp > now:
                # Not due yet: wait until due, or until an earlier event is
                # added or stop() is called, then re-evaluate the queue.
                self._wait_event.wait(timestamp - now)
                self._wait_event.clear()
                self._timestamp_queue.put(item)
                continue

            timer_event.execute()
            if timer_event.repeat:
                self.add_timer_event(timer_event, False)
class DbUtils: class DbUtils:
lock = threading.Lock() lock = threading.Lock()
db_instances = threading.local() db_instances = threading.local()

View File

@ -292,6 +292,46 @@ class TestSfp:
assert sfp.get_transceiver_threshold_info() assert sfp.get_transceiver_threshold_info()
sfp.reinit() sfp.reinit()
@mock.patch('os.path.exists')
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_temperature(self, mock_read, mock_exists):
    """Check SFP.get_temperature() in software-control and sysfs modes."""
    module = SFP(0)

    # Software control: the result is None whether or not the sysfs node exists.
    module.is_sw_control = mock.MagicMock(return_value=True)
    for sysfs_present in (False, True):
        mock_exists.return_value = sysfs_present
        assert module.get_temperature() is None

    # Firmware control with an unreadable sysfs node.
    mock_read.return_value = None
    module.is_sw_control.return_value = False
    assert module.get_temperature() is None

    # Firmware control: the raw sysfs value 448 is reported as 56.0.
    mock_read.return_value = 448
    assert module.get_temperature() == 56.0
def test_get_temperature_threshold(self):
    """Check the default and EEPROM-provided temperature thresholds."""
    module = SFP(0)
    module.is_sw_control = mock.MagicMock(return_value=True)

    # Defaults are expected before any xcvr API is mocked in.
    assert module.get_temperature_warning_threashold() == 70.0
    assert module.get_temperature_critical_threashold() == 80.0

    # An API that does not support thresholds also yields the defaults.
    fake_api = mock.MagicMock()
    fake_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
    module.get_xcvr_api = mock.MagicMock(return_value=fake_api)
    assert module.get_temperature_warning_threashold() == 70.0
    assert module.get_temperature_critical_threashold() == 80.0

    # With threshold support, the values read from the EEPROM are returned.
    from sonic_platform_base.sonic_xcvr.fields import consts
    fake_api.get_transceiver_thresholds_support.return_value = True
    fake_api.xcvr_eeprom = mock.MagicMock()
    eeprom_thresholds = {
        consts.TEMP_HIGH_ALARM_FIELD: 85.0,
        consts.TEMP_HIGH_WARNING_FIELD: 75.0
    }
    fake_api.xcvr_eeprom.read = mock.MagicMock(return_value=eeprom_thresholds)
    assert module.get_temperature_warning_threashold() == 75.0
    assert module.get_temperature_critical_threashold() == 85.0
@mock.patch('sonic_platform.utils.read_int_from_file') @mock.patch('sonic_platform.utils.read_int_from_file')
@mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode') @mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode')
@mock.patch('sonic_platform.utils.DbUtils.get_db_instance') @mock.patch('sonic_platform.utils.DbUtils.get_db_instance')

View File

@ -31,6 +31,7 @@ sys.path.insert(0, modules_path)
import sonic_platform.chassis import sonic_platform.chassis
from sonic_platform.chassis import Chassis from sonic_platform.chassis import Chassis
from sonic_platform.device_data import DeviceDataManager from sonic_platform.device_data import DeviceDataManager
from sonic_platform.sfp import SFP
sonic_platform.chassis.extract_RJ45_ports_index = mock.MagicMock(return_value=[]) sonic_platform.chassis.extract_RJ45_ports_index = mock.MagicMock(return_value=[])
@ -148,23 +149,27 @@ class TestThermal:
@mock.patch('os.path.exists', mock.MagicMock(return_value=True)) @mock.patch('os.path.exists', mock.MagicMock(return_value=True))
def test_sfp_thermal(self): def test_sfp_thermal(self):
from sonic_platform.thermal import initialize_sfp_thermal, THERMAL_NAMING_RULE from sonic_platform.thermal import THERMAL_NAMING_RULE
thermal_list = initialize_sfp_thermal(0) sfp = SFP(0)
thermal_list = sfp.get_all_thermals()
assert len(thermal_list) == 1 assert len(thermal_list) == 1
thermal = thermal_list[0] thermal = thermal_list[0]
rule = THERMAL_NAMING_RULE['sfp thermals'] rule = THERMAL_NAMING_RULE['sfp thermals']
start_index = rule.get('start_index', 1) start_index = rule.get('start_index', 1)
assert thermal.get_name() == rule['name'].format(start_index) assert thermal.get_name() == rule['name'].format(start_index)
assert rule['temperature'].format(start_index) in thermal.temperature
assert rule['high_threshold'].format(start_index) in thermal.high_threshold
assert rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
assert thermal.get_position_in_parent() == 1 assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False assert thermal.is_replaceable() == False
sfp.get_temperature = mock.MagicMock(return_value=35.4)
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
assert thermal.get_temperature() == 35.4
assert thermal.get_high_threshold() == 70
assert thermal.get_high_critical_threshold() == 80
@mock.patch('sonic_platform.utils.read_float_from_file') @mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_temperature(self, mock_read): def test_get_temperature(self, mock_read):
from sonic_platform.thermal import Thermal from sonic_platform.thermal import Thermal
thermal = Thermal('test', 'temp_file', None, None, 1) thermal = Thermal('test', 'temp_file', None, None, None, None, 1000, 1)
mock_read.return_value = 35727 mock_read.return_value = 35727
assert thermal.get_temperature() == 35.727 assert thermal.get_temperature() == 35.727
@ -177,7 +182,7 @@ class TestThermal:
@mock.patch('sonic_platform.utils.read_float_from_file') @mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_high_threshold(self, mock_read): def test_get_high_threshold(self, mock_read):
from sonic_platform.thermal import Thermal from sonic_platform.thermal import Thermal
thermal = Thermal('test', None, None, None, 1) thermal = Thermal('test', None, None, None, None, None, 1000, 1)
assert thermal.get_high_threshold() is None assert thermal.get_high_threshold() is None
thermal.high_threshold = 'high_th_file' thermal.high_threshold = 'high_th_file'
@ -193,7 +198,7 @@ class TestThermal:
@mock.patch('sonic_platform.utils.read_float_from_file') @mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_high_critical_threshold(self, mock_read): def test_get_high_critical_threshold(self, mock_read):
from sonic_platform.thermal import Thermal from sonic_platform.thermal import Thermal
thermal = Thermal('test', None, None, None, 1) thermal = Thermal('test', None, None, None, None, None, 1000, 1)
assert thermal.get_high_critical_threshold() is None assert thermal.get_high_critical_threshold() is None
thermal.high_critical_threshold = 'high_th_file' thermal.high_critical_threshold = 'high_th_file'

View File

@ -0,0 +1,128 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
from unittest import mock
from sonic_platform import utils
from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update
from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
# Sample thermal-control configuration, as JSON text. test_load_tc_config_mocked
# hands this string to ThermalUpdater.load_tc_config() through a mocked open().
# It carries one parameter set for "asic" (poll_time 3) and one for a module
# name pattern (poll_time 20).
# The four backslashes in the module key become two once Python processes the
# string literal; JSON then decodes that escaped backslash to a single one,
# giving the key module\d+ (presumably used as a regex over module names --
# TODO confirm against load_tc_config).
mock_tc_config = """
{
"dev_parameters": {
"asic": {
"pwm_min": 20,
"pwm_max": 100,
"val_min": "!70000",
"val_max": "!105000",
"poll_time": 3
},
"module\\\\d+": {
"pwm_min": 20,
"pwm_max": 100,
"val_min": 60000,
"val_max": 80000,
"poll_time": 20
}
}
}
"""
class TestThermalUpdater:
    """Unit tests for ``sonic_platform.thermal_updater.ThermalUpdater``.

    ``hw_management_independent_mode_update`` is a mock object shared at
    module level by every test in this process. Tests that make call-count
    assertions on it therefore reset it first, so that their outcome does not
    depend on which tests ran before them.
    """

    def test_load_tc_config_non_exists(self):
        """load_tc_config() without a mocked config file queues two timer entries.

        NOTE(review): presumably the real config file is absent in the test
        environment and defaults are used -- confirm against load_tc_config().
        """
        updater = ThermalUpdater(None)
        updater.load_tc_config()
        assert updater._timer._timestamp_queue.qsize() == 2

    def test_load_tc_config_mocked(self):
        """load_tc_config() fed with ``mock_tc_config`` queues two timer entries."""
        updater = ThermalUpdater(None)
        mock_os_open = mock.mock_open(read_data=mock_tc_config)
        with mock.patch('sonic_platform.utils.open', mock_os_open):
            updater.load_tc_config()
        assert updater._timer._timestamp_queue.qsize() == 2

    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_asic', mock.MagicMock())
    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_module', mock.MagicMock())
    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.wait_all_sfp_ready')
    @mock.patch('sonic_platform.utils.write_file')
    def test_start_stop(self, mock_write, mock_wait):
        """start()/stop() write the expected value to the suspend file.

        Args:
            mock_write: patched ``sonic_platform.utils.write_file``.
            mock_wait: patched ``ThermalUpdater.wait_all_sfp_ready``.
        """
        # Case 1: all SFPs become ready -> 0 is written on start, 1 on stop.
        mock_wait.return_value = True
        mock_sfp = mock.MagicMock()
        mock_sfp.sdk_index = 1
        updater = ThermalUpdater([mock_sfp])
        updater.start()
        mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0)
        utils.wait_until(updater._timer.is_alive, timeout=5)
        # Make sure the timer really started; otherwise the "not alive after
        # stop" assertion below would pass trivially.
        assert updater._timer.is_alive()

        mock_write.reset_mock()
        updater.stop()
        assert not updater._timer.is_alive()
        mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1)

        # Case 2: SFPs never become ready -> 1 is written already on start.
        mock_wait.return_value = False
        mock_write.reset_mock()
        updater.start()
        mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1)
        updater.stop()

    @mock.patch('sonic_platform.thermal_updater.time.sleep', mock.MagicMock())
    def test_wait_all_sfp_ready(self):
        """wait_all_sfp_ready() is truthy normally and falsy when is_sw_control raises."""
        mock_sfp = mock.MagicMock()
        mock_sfp.is_sw_control = mock.MagicMock(return_value=True)
        updater = ThermalUpdater([mock_sfp])
        assert updater.wait_all_sfp_ready()

        mock_sfp.is_sw_control.side_effect = Exception('')
        assert not updater.wait_all_sfp_ready()

    @mock.patch('sonic_platform.utils.read_int_from_file')
    def test_update_asic(self, mock_read):
        """ASIC getters convert the value read from file; None falls back to defaults.

        Args:
            mock_read: patched ``sonic_platform.utils.read_int_from_file``.
        """
        # Drop calls recorded by earlier tests on the shared mock so that the
        # assert_called_once() below only sees this test's update_asic() call.
        hw_management_independent_mode_update.reset_mock()

        # A raw reading of 8 is expected to be reported as 1000 by all getters.
        mock_read.return_value = 8
        updater = ThermalUpdater(None)
        assert updater.get_asic_temp() == 1000
        assert updater.get_asic_temp_warning_threashold() == 1000
        assert updater.get_asic_temp_critical_threashold() == 1000
        updater.update_asic()
        hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()

        # When nothing can be read, the temperature is None and the
        # thresholds fall back to the module-level default constants.
        mock_read.return_value = None
        assert updater.get_asic_temp() is None
        assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
        assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def test_update_module(self):
        """update_module() forwards module data, or cleans it when the SFP is absent."""
        # Drop calls recorded by earlier tests on the shared mock so that the
        # first assert_called_once_with() does not depend on test order.
        hw_management_independent_mode_update.reset_mock()

        mock_sfp = mock.MagicMock()
        mock_sfp.sdk_index = 10
        mock_sfp.get_presence = mock.MagicMock(return_value=True)
        mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
        mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
        mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
        updater = ThermalUpdater([mock_sfp])
        updater.update_module()
        # Expected: module index is sdk_index + 1, values are scaled by 1000,
        # and the critical threshold is passed before the warning threshold.
        hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)

        # A temperature of 0.0 is expected to zero the thresholds as well.
        mock_sfp.get_temperature = mock.MagicMock(return_value=0.0)
        hw_management_independent_mode_update.reset_mock()
        updater.update_module()
        hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0)

        # An absent module gets its thermal data cleaned instead of set.
        mock_sfp.get_presence = mock.MagicMock(return_value=False)
        updater.update_module()
        hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11)

View File

@ -191,6 +191,26 @@ class TestUtils:
mock_os_open = mock.mock_open(read_data='a:b') mock_os_open = mock.mock_open(read_data='a:b')
with mock.patch('sonic_platform.utils.open', mock_os_open): with mock.patch('sonic_platform.utils.open', mock_os_open):
assert utils.read_key_value_file('some_file') == {'a':'b'} assert utils.read_key_value_file('some_file') == {'a':'b'}
mock_os_open = mock.mock_open(read_data='a=b') mock_os_open = mock.mock_open(read_data='a=b')
with mock.patch('sonic_platform.utils.open', mock_os_open): with mock.patch('sonic_platform.utils.open', mock_os_open):
assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'} assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'}
def test_timer(self):
    """Check how utils.Timer honours ``repeat`` and ``run_now``.

    Four callbacks are scheduled and the timer is left running for three
    seconds of real time. Expected afterwards:
      * interval 1000, run_now=True  -> called exactly once
      * interval 1000, run_now=False -> never called
      * interval 1, one-shot         -> called exactly once
      * interval 1, repeating        -> called more than once
    NOTE(review): intervals are presumably in seconds -- confirm in utils.Timer.
    """
    immediate_cb = mock.MagicMock()
    far_future_cb = mock.MagicMock()
    one_shot_cb = mock.MagicMock()
    repeating_cb = mock.MagicMock()

    # (interval, callback, repeat, run_now), scheduled in this order.
    jobs = (
        (1000, immediate_cb, False, True),
        (1000, far_future_cb, False, False),
        (1, one_shot_cb, False, False),
        (1, repeating_cb, True, False),
    )

    timer = utils.Timer()
    timer.start()
    for interval, callback, repeat, run_now in jobs:
        timer.schedule(interval, cb=callback, repeat=repeat, run_now=run_now)

    # Give the interval-1 jobs time to fire before shutting the timer down.
    time.sleep(3)
    timer.stop()

    immediate_cb.assert_called_once()
    far_future_cb.assert_not_called()
    one_shot_cb.assert_called_once()
    assert repeating_cb.call_count >= 2