diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 6efdfb61f4..f0b73de66c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -82,6 +82,8 @@ class Chassis(ChassisBase): # System UID LED _led_uid = None + chassis_instance = None + def __init__(self): super(Chassis, self).__init__() @@ -127,6 +129,8 @@ class Chassis(ChassisBase): self._RJ45_port_inited = False self._RJ45_port_list = None + Chassis.chassis_instance = self + self.modules_mgmt_thread = threading.Thread() self.modules_changes_queue = queue.Queue() self.modules_mgmt_task_stopping_event = threading.Event() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index 9b9e97d189..c5bcfddaf1 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -31,6 +31,8 @@ try: from . 
import utils
     from .device_data import DeviceDataManager
     from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
+    from sonic_platform_base.sonic_xcvr.fields import consts
+    from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436
 except ImportError as e:
     raise ImportError (str(e) + "- required module not found")
 
@@ -155,6 +157,10 @@ SFP_TYPE_SFF8636 = 'sff8636'
 # SFP stderr
 SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'
 
+SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
+SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
+SFP_TEMPERATURE_SCALE = 8.0
+
 # SFP EEPROM limited bytes
 limited_eeprom = {
     SFP_TYPE_CMIS: {
@@ -264,7 +270,7 @@ class SFP(NvidiaSFPCommon):
 
         if slot_id == 0: # For non-modular chassis
             from .thermal import initialize_sfp_thermal
-            self._thermal_list = initialize_sfp_thermal(sfp_index)
+            self._thermal_list = initialize_sfp_thermal(self)
         else: # For modular chassis
             # (slot_id % MAX_LC_CONUNT - 1) * MAX_PORT_COUNT + (sfp_index + 1) * (MAX_PORT_COUNT / LC_PORT_COUNT)
             max_linecard_count = DeviceDataManager.get_linecard_count()
@@ -822,6 +828,77 @@ class SFP(NvidiaSFPCommon):
         api = self.get_xcvr_api()
         return [False] * api.NUM_CHANNELS if api else None
 
+    def get_temperature(self):
+        try:
+            if not self.is_sw_control():
+                temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
+                if not os.path.exists(temp_file):
+                    logger.log_error(f'Failed to read from file {temp_file} - not exists')
+                    return None
+                temperature = utils.read_int_from_file(temp_file,
+                                                       log_func=None)
+                return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else None
+        except Exception:  # not a bare except: SystemExit/KeyboardInterrupt must propagate
+            return 0.0
+
+        self.reinit()
+        temperature = super().get_temperature()
+        return temperature if temperature is not None else None
+
+    def get_temperature_warning_threashold(self):
+        """Get temperature warning threshold
+
+        Returns:
+            float: temperature warning threshold, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD if it cannot be read
+        """
+        try:
+            if not self.is_sw_control():
+                emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
+                                                     log_func=None,
+                                                     default=None)
+                return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
+        except Exception:  # not a bare except: SystemExit/KeyboardInterrupt must propagate
+            return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
+
+        thresh = self._get_temperature_threshold()
+        if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
+            return thresh[consts.TEMP_HIGH_WARNING_FIELD]
+        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
+
+    def get_temperature_critical_threashold(self):
+        """Get temperature critical threshold
+
+        Returns:
+            float: temperature critical threshold, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD if it cannot be read
+        """
+        try:
+            if not self.is_sw_control():
+                critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
+                                                    log_func=None,
+                                                    default=None)
+                return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
+        except Exception:  # not a bare except: SystemExit/KeyboardInterrupt must propagate
+            return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
+
+        thresh = self._get_temperature_threshold()
+        if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
+            return thresh[consts.TEMP_HIGH_ALARM_FIELD]
+        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
+
+    def _get_temperature_threshold(self):
+        self.reinit()
+        api = self.get_xcvr_api()
+        if not api:
+            return None
+
+        thresh_support = api.get_transceiver_thresholds_support()
+        if thresh_support:
+            if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api):
+                return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
+            return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
+        else:
+            return None
+
     def get_xcvr_api(self):
         """
         Retrieves the XcvrApi associated with this SFP
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
index 1a6b45da63..eadc822e3d 100644
--- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
@@ -36,6 +36,8 
@@ except ImportError as e: # Global logger class instance logger = Logger() +DEFAULT_TEMP_SCALE = 1000 + """ The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types @@ -72,9 +74,11 @@ THERMAL_NAMING_RULE = { "chassis thermals": [ { "name": "ASIC", - "temperature": "asic", - "high_threshold": "asic_temp_emergency", - "high_critical_threshold": "asic_temp_trip_crit" + "temperature": "input", + "high_threshold_default": 105, + "high_critical_threshold_default": 120, + "sysfs_folder": "/sys/module/sx_core/asic0/temperature", + "scale": 8 }, { "name": "Ambient Port Side Temp", @@ -187,8 +191,8 @@ def initialize_psu_thermal(psu_index, presence_cb): return [create_indexable_thermal(THERMAL_NAMING_RULE['psu thermals'], psu_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1, presence_cb)] -def initialize_sfp_thermal(sfp_index): - return [create_indexable_thermal(THERMAL_NAMING_RULE['sfp thermals'], sfp_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1)] +def initialize_sfp_thermal(sfp): + return [ModuleThermal(sfp)] def initialize_linecard_thermals(lc_name, lc_index): @@ -214,6 +218,7 @@ def initialize_linecard_sfp_thermal(lc_name, lc_index, sfp_index): def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=None): index += rule.get('start_index', 1) name = rule['name'].format(index) + sysfs_folder = rule.get('sysfs_folder', sysfs_folder) temp_file = os.path.join(sysfs_folder, rule['temperature'].format(index)) _check_thermal_sysfs_existence(temp_file) if 'high_threshold' in rule: @@ -226,10 +231,13 @@ def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=No _check_thermal_sysfs_existence(high_crit_th_file) else: high_crit_th_file = None + high_th_default = rule.get('high_threshold_default') + high_crit_th_default = rule.get('high_critical_threshold_default') + scale 
= rule.get('scale', DEFAULT_TEMP_SCALE) if not presence_cb: - return Thermal(name, temp_file, high_th_file, high_crit_th_file, position) + return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position) else: - return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb) + return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb) def create_single_thermal(rule, sysfs_folder, position, presence_cb=None): @@ -243,6 +251,7 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None): elif not default_present: return None + sysfs_folder = rule.get('sysfs_folder', sysfs_folder) temp_file = os.path.join(sysfs_folder, temp_file) _check_thermal_sysfs_existence(temp_file) if 'high_threshold' in rule: @@ -255,11 +264,14 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None): _check_thermal_sysfs_existence(high_crit_th_file) else: high_crit_th_file = None + high_th_default = rule.get('high_threshold_default') + high_crit_th_default = rule.get('high_critical_threshold_default') + scale = rule.get('scale', DEFAULT_TEMP_SCALE) name = rule['name'] if not presence_cb: - return Thermal(name, temp_file, high_th_file, high_crit_th_file, position) + return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position) else: - return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb) + return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb) def _check_thermal_sysfs_existence(file_path): @@ -268,7 +280,7 @@ def _check_thermal_sysfs_existence(file_path): class Thermal(ThermalBase): - def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position): + def __init__(self, name, temp_file, 
high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position): """ index should be a string for category ambient and int for other categories """ @@ -278,6 +290,9 @@ class Thermal(ThermalBase): self.temperature = temp_file self.high_threshold = high_th_file self.high_critical_threshold = high_crit_th_file + self.high_th_default = high_th_default + self.high_crit_th_default = high_crit_th_default + self.scale = scale def get_name(self): """ @@ -297,7 +312,7 @@ class Thermal(ThermalBase): of one degree Celsius, e.g. 30.125 """ value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info) - return value / 1000.0 if (value is not None and value != 0) else None + return value / self.scale if (value is not None and value != 0) else None def get_high_threshold(self): """ @@ -308,9 +323,9 @@ class Thermal(ThermalBase): up to nearest thousandth of one degree Celsius, e.g. 30.125 """ if self.high_threshold is None: - return None + return self.high_th_default value = utils.read_float_from_file(self.high_threshold, None, log_func=logger.log_info) - return value / 1000.0 if (value is not None and value != 0) else None + return value / self.scale if (value is not None and value != 0) else self.high_th_default def get_high_critical_threshold(self): """ @@ -321,9 +336,9 @@ class Thermal(ThermalBase): up to nearest thousandth of one degree Celsius, e.g. 
30.125 """ if self.high_critical_threshold is None: - return None + return self.high_crit_th_default value = utils.read_float_from_file(self.high_critical_threshold, None, log_func=logger.log_info) - return value / 1000.0 if (value is not None and value != 0) else None + return value / self.scale if (value is not None and value != 0) else self.high_crit_th_default def get_position_in_parent(self): """ @@ -343,8 +358,8 @@ class Thermal(ThermalBase): class RemovableThermal(Thermal): - def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb): - super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position) + def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb): + super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position) self.presence_cb = presence_cb def get_temperature(self): @@ -388,3 +403,68 @@ class RemovableThermal(Thermal): logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) return None return super(RemovableThermal, self).get_high_critical_threshold() + + +class ModuleThermal(ThermalBase): + def __init__(self, sfp): + """ + index should be a string for category ambient and int for other categories + """ + super(ModuleThermal, self).__init__() + self.name = f'xSFP module {sfp.sdk_index + 1} Temp' + self.sfp = sfp + + def get_name(self): + """ + Retrieves the name of the device + + Returns: + string: The name of the device + """ + return self.name + + def get_temperature(self): + """ + Retrieves current temperature reading from thermal + + Returns: + A float number of current temperature in Celsius up to nearest thousandth + of one degree Celsius, e.g. 
30.125 + """ + return self.sfp.get_temperature() + + def get_high_threshold(self): + """ + Retrieves the high threshold temperature of thermal + + Returns: + A float number, the high threshold temperature of thermal in Celsius + up to nearest thousandth of one degree Celsius, e.g. 30.125 + """ + return self.sfp.get_temperature_warning_threashold() + + def get_high_critical_threshold(self): + """ + Retrieves the high critical threshold temperature of thermal + + Returns: + A float number, the high critical threshold temperature of thermal in Celsius + up to nearest thousandth of one degree Celsius, e.g. 30.125 + """ + return self.sfp.get_temperature_critical_threashold() + + def get_position_in_parent(self): + """ + Retrieves 1-based relative physical position in parent device + Returns: + integer: The 1-based relative physical position in parent device + """ + return 1 + + def is_replaceable(self): + """ + Indicate whether this device is replaceable. + Returns: + bool: True if it is replaceable. + """ + return False diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index dd3d794d85..9e1aaded05 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -15,9 +15,36 @@ # limitations under the License. # from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase +from . import thermal_updater +from .device_data import DeviceDataManager class ThermalManager(ThermalManagerBase): + thermal_updater_task = None + @classmethod def run_policy(cls, chassis): pass + + @classmethod + def initialize(cls): + """ + Initialize thermal manager, including register thermal condition types and thermal action types + and any other vendor specific initialization. 
+        :return:
+        """
+        if DeviceDataManager.is_independent_mode():
+            from .chassis import Chassis
+            cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
+            cls.thermal_updater_task.start()
+
+
+    @classmethod
+    def deinitialize(cls):
+        """
+        Destroy thermal manager, including any vendor specific cleanup. In independent mode this stops
+        the thermal updater task if initialize() created one; otherwise it does nothing.
+        :return:
+        """
+        if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
+            cls.thermal_updater_task.stop()
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py
new file mode 100644
index 0000000000..ad0b92ef4e
--- /dev/null
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py
@@ -0,0 +1,213 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from . 
import utils +from sonic_py_common import logger + +import sys +import time + +sys.path.append('/run/hw-management/bin') + +try: + import hw_management_independent_mode_update +except ImportError: + # For unit test only + from unittest import mock + hw_management_independent_mode_update = mock.MagicMock() + hw_management_independent_mode_update.module_data_set_module_counter = mock.MagicMock() + hw_management_independent_mode_update.thermal_data_set_asic = mock.MagicMock() + hw_management_independent_mode_update.thermal_data_set_module = mock.MagicMock() + hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock() + hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock() + + +SFP_TEMPERATURE_SCALE = 1000 +ASIC_TEMPERATURE_SCALE = 125 +ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000 +ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000 + +ERROR_READ_THERMAL_DATA = 254000 + +TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json' +logger = logger.Logger('thermal-updater') + + +class ThermalUpdater: + def __init__(self, sfp_list): + self._sfp_list = sfp_list + self._sfp_status = {} + self._timer = utils.Timer() + + def load_tc_config(self): + asic_poll_interval = 1 + sfp_poll_interval = 10 + data = utils.load_json_file(TC_CONFIG_FILE) + if not data: + logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval') + + if data: + dev_parameters = data.get('dev_parameters') + if dev_parameters is not None: + asic_parameter = dev_parameters.get('asic') + if asic_parameter is not None: + asic_poll_interval_config = asic_parameter.get('poll_time') + if asic_poll_interval_config: + asic_poll_interval = int(asic_poll_interval_config) / 2 + module_parameter = dev_parameters.get('module\\d+') + if module_parameter is not None: + sfp_poll_interval_config = module_parameter.get('poll_time') + if sfp_poll_interval_config: + sfp_poll_interval = int(sfp_poll_interval_config) / 2 + + logger.log_notice(f'ASIC polling 
interval: {asic_poll_interval}')
+        self._timer.schedule(asic_poll_interval, self.update_asic)
+        logger.log_notice(f'Module polling interval: {sfp_poll_interval}')
+        self._timer.schedule(sfp_poll_interval, self.update_module)
+
+    def start(self):
+        self.clean_thermal_data()
+        if not self.wait_all_sfp_ready():
+            logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend')
+            self.control_tc(True)
+            return
+        self.control_tc(False)
+        self.load_tc_config()
+        self._timer.start()
+
+    def stop(self):
+        self._timer.stop()
+        self.control_tc(True)
+
+    def control_tc(self, suspend):
+        logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}')
+        utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0)
+
+    def clean_thermal_data(self):
+        hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list))
+        hw_management_independent_mode_update.thermal_data_clean_asic(0)
+        for sfp in self._sfp_list:
+            hw_management_independent_mode_update.thermal_data_clean_module(
+                0,
+                sfp.sdk_index + 1
+            )
+
+    def wait_all_sfp_ready(self):
+        logger.log_notice('Waiting for all SFP modules ready...')
+        max_wait_time = 60
+        ready_set = set()
+        while len(ready_set) != len(self._sfp_list):
+            for sfp in self._sfp_list:
+                try:
+                    sfp.is_sw_control()
+                    ready_set.add(sfp)
+                except Exception:  # module not ready yet; retry on the next pass (do not trap SystemExit)
+                    continue
+            max_wait_time -= 1
+            if max_wait_time == 0:
+                return False
+            time.sleep(1)
+
+        logger.log_notice('All SFP modules are ready')
+        return True
+
+    def get_asic_temp(self):
+        temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
+        return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None
+
+    def get_asic_temp_warning_threashold(self):
+        emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
+        return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
+
+    def get_asic_temp_critical_threashold(self):
+        critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
+        return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
+
+    def update_single_module(self, sfp):
+        try:
+            presence = sfp.get_presence()
+            pre_presence = self._sfp_status.get(sfp.sdk_index)
+            if presence:
+                temperature = sfp.get_temperature()
+                if temperature == 0:
+                    warning_thresh = 0
+                    critical_thresh = 0
+                    fault = 0
+                else:
+                    warning_thresh = sfp.get_temperature_warning_threashold()
+                    critical_thresh = sfp.get_temperature_critical_threashold()
+                    fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
+                    temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
+                    warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
+                    critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)
+
+                hw_management_independent_mode_update.thermal_data_set_module(
+                    0, # ASIC index always 0 for now
+                    sfp.sdk_index + 1,
+                    temperature,
+                    critical_thresh,
+                    warning_thresh,
+                    fault
+                )
+            else:
+                if pre_presence != presence:
+                    hw_management_independent_mode_update.thermal_data_clean_module(0, sfp.sdk_index + 1)
+
+            if pre_presence != presence:
+                self._sfp_status[sfp.sdk_index] = presence
+        except Exception as e:
+            logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
+            hw_management_independent_mode_update.thermal_data_set_module(
+                0, # ASIC index always 0 for now
+                sfp.sdk_index + 1,
+                0,
+                0,
+                0,
+                ERROR_READ_THERMAL_DATA
+            )
+
+    def update_module(self):
+        for sfp in self._sfp_list:
+            self.update_single_module(sfp)
+
+    def update_asic(self):
+        try:
+            asic_temp = self.get_asic_temp()
+            warn_threshold = self.get_asic_temp_warning_threashold()
+            
critical_threshold = self.get_asic_temp_critical_threashold()
+            fault = 0
+            if asic_temp is None:
+                logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
+                asic_temp = warn_threshold
+                fault = ERROR_READ_THERMAL_DATA
+
+            hw_management_independent_mode_update.thermal_data_set_asic(
+                0, # ASIC index always 0 for now
+                asic_temp,
+                critical_threshold,
+                warn_threshold,
+                fault
+            )
+        except Exception as e:
+            logger.log_error(f'Failed to update ASIC thermal data - {e}')
+            hw_management_independent_mode_update.thermal_data_set_asic(
+                0, # ASIC index always 0 for now
+                0,
+                0,
+                0,
+                ERROR_READ_THERMAL_DATA
+            )
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
index 51e9bc7f03..9db38e6b41 100644
--- a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
@@ -18,6 +18,7 @@ import ctypes
 import functools
 import subprocess
 import json
+import queue
 import sys
 import threading
 import time
@@ -289,6 +290,64 @@ def wait_until(predict, timeout, interval=1, *args, **kwargs):
     return False
 
 
+class TimerEvent:
+    def __init__(self, interval, cb, repeat):
+        self.interval = interval
+        self._cb = cb
+        self.repeat = repeat
+
+    def __lt__(self, other):
+        # Tie-breaker: PriorityQueue compares the events when two (timestamp, event) entries share a timestamp
+        return self.interval < other.interval
+
+    def execute(self):
+        self._cb()
+
+
+class Timer(threading.Thread):
+    def __init__(self):
+        super(Timer, self).__init__()
+        self._timestamp_queue = queue.PriorityQueue()
+        self._wait_event = threading.Event()
+        self._stop_event = threading.Event()
+        self._min_timestamp = None
+
+    def schedule(self, interval, cb, repeat=True, run_now=True):
+        timer_event = TimerEvent(interval, cb, repeat)
+        self.add_timer_event(timer_event, run_now)
+
+    def add_timer_event(self, timer_event, run_now=True):
+        timestamp = time.time()
+        if not run_now:
+            timestamp += timer_event.interval
+
+        self._timestamp_queue.put_nowait((timestamp, timer_event))
+        if self._min_timestamp is not None and timestamp < 
self._min_timestamp:
+            self._wait_event.set()
+
+    def stop(self):
+        if self.is_alive():
+            self._stop_event.set()  # set the stop flag first so the woken worker sees it on its next loop check
+            self._wait_event.set()
+            self.join()
+
+    def run(self):
+        while not self._stop_event.is_set():
+            now = time.time()
+            item = self._timestamp_queue.get()
+            self._min_timestamp = item[0]
+            if self._min_timestamp > now:
+                self._wait_event.wait(self._min_timestamp - now)
+                self._wait_event.clear()
+                self._timestamp_queue.put(item)
+                continue
+
+            timer_event = item[1]
+            timer_event.execute()
+            if timer_event.repeat:
+                self.add_timer_event(timer_event, False)
+
+
 class DbUtils:
     lock = threading.Lock()
     db_instances = threading.local()
diff --git a/platform/mellanox/mlnx-platform-api/tests/test_sfp.py b/platform/mellanox/mlnx-platform-api/tests/test_sfp.py
index 6bdc82b5b4..dccc727bfe 100644
--- a/platform/mellanox/mlnx-platform-api/tests/test_sfp.py
+++ b/platform/mellanox/mlnx-platform-api/tests/test_sfp.py
@@ -292,6 +292,46 @@ class TestSfp:
         assert sfp.get_transceiver_threshold_info()
         sfp.reinit()
 
+    @mock.patch('os.path.exists')
+    @mock.patch('sonic_platform.utils.read_int_from_file')
+    def test_get_temperature(self, mock_read, mock_exists):
+        sfp = SFP(0)
+        sfp.is_sw_control = mock.MagicMock(return_value=True)
+        mock_exists.return_value = False
+        assert sfp.get_temperature() == None
+
+        mock_exists.return_value = True
+        assert sfp.get_temperature() == None
+
+        mock_read.return_value = None
+        sfp.is_sw_control.return_value = False
+        assert sfp.get_temperature() == None
+
+        mock_read.return_value = 448
+        assert sfp.get_temperature() == 56.0
+
+    def test_get_temperature_threshold(self):
+        sfp = SFP(0)
+        sfp.is_sw_control = mock.MagicMock(return_value=True)
+        assert sfp.get_temperature_warning_threashold() == 70.0
+        assert sfp.get_temperature_critical_threashold() == 80.0
+
+        mock_api = mock.MagicMock()
+        mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
+        sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)
+        assert 
sfp.get_temperature_warning_threashold() == 70.0 + assert sfp.get_temperature_critical_threashold() == 80.0 + + from sonic_platform_base.sonic_xcvr.fields import consts + mock_api.get_transceiver_thresholds_support.return_value = True + mock_api.xcvr_eeprom = mock.MagicMock() + mock_api.xcvr_eeprom.read = mock.MagicMock(return_value={ + consts.TEMP_HIGH_ALARM_FIELD: 85.0, + consts.TEMP_HIGH_WARNING_FIELD: 75.0 + }) + assert sfp.get_temperature_warning_threashold() == 75.0 + assert sfp.get_temperature_critical_threashold() == 85.0 + @mock.patch('sonic_platform.utils.read_int_from_file') @mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode') @mock.patch('sonic_platform.utils.DbUtils.get_db_instance') diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal.py index db81f73096..a59b8dda40 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal.py @@ -31,6 +31,7 @@ sys.path.insert(0, modules_path) import sonic_platform.chassis from sonic_platform.chassis import Chassis from sonic_platform.device_data import DeviceDataManager +from sonic_platform.sfp import SFP sonic_platform.chassis.extract_RJ45_ports_index = mock.MagicMock(return_value=[]) @@ -148,23 +149,27 @@ class TestThermal: @mock.patch('os.path.exists', mock.MagicMock(return_value=True)) def test_sfp_thermal(self): - from sonic_platform.thermal import initialize_sfp_thermal, THERMAL_NAMING_RULE - thermal_list = initialize_sfp_thermal(0) + from sonic_platform.thermal import THERMAL_NAMING_RULE + sfp = SFP(0) + thermal_list = sfp.get_all_thermals() assert len(thermal_list) == 1 thermal = thermal_list[0] rule = THERMAL_NAMING_RULE['sfp thermals'] start_index = rule.get('start_index', 1) assert thermal.get_name() == rule['name'].format(start_index) - assert rule['temperature'].format(start_index) in thermal.temperature - assert 
rule['high_threshold'].format(start_index) in thermal.high_threshold - assert rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold assert thermal.get_position_in_parent() == 1 assert thermal.is_replaceable() == False + sfp.get_temperature = mock.MagicMock(return_value=35.4) + sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70) + sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80) + assert thermal.get_temperature() == 35.4 + assert thermal.get_high_threshold() == 70 + assert thermal.get_high_critical_threshold() == 80 @mock.patch('sonic_platform.utils.read_float_from_file') def test_get_temperature(self, mock_read): from sonic_platform.thermal import Thermal - thermal = Thermal('test', 'temp_file', None, None, 1) + thermal = Thermal('test', 'temp_file', None, None, None, None, 1000, 1) mock_read.return_value = 35727 assert thermal.get_temperature() == 35.727 @@ -177,7 +182,7 @@ class TestThermal: @mock.patch('sonic_platform.utils.read_float_from_file') def test_get_high_threshold(self, mock_read): from sonic_platform.thermal import Thermal - thermal = Thermal('test', None, None, None, 1) + thermal = Thermal('test', None, None, None, None, None, 1000, 1) assert thermal.get_high_threshold() is None thermal.high_threshold = 'high_th_file' @@ -193,7 +198,7 @@ class TestThermal: @mock.patch('sonic_platform.utils.read_float_from_file') def test_get_high_critical_threshold(self, mock_read): from sonic_platform.thermal import Thermal - thermal = Thermal('test', None, None, None, 1) + thermal = Thermal('test', None, None, None, None, None, 1000, 1) assert thermal.get_high_critical_threshold() is None thermal.high_critical_threshold = 'high_th_file' diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py new file mode 100644 index 0000000000..1a34a7440a --- /dev/null +++ 
b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -0,0 +1,128 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import time +from unittest import mock + +from sonic_platform import utils +from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update +from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \ + ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD + + +mock_tc_config = """ +{ + "dev_parameters": { + "asic": { + "pwm_min": 20, + "pwm_max": 100, + "val_min": "!70000", + "val_max": "!105000", + "poll_time": 3 + }, + "module\\\\d+": { + "pwm_min": 20, + "pwm_max": 100, + "val_min": 60000, + "val_max": 80000, + "poll_time": 20 + } + } +} +""" + + +class TestThermalUpdater: + def test_load_tc_config_non_exists(self): + updater = ThermalUpdater(None) + updater.load_tc_config() + assert updater._timer._timestamp_queue.qsize() == 2 + + def test_load_tc_config_mocked(self): + updater = ThermalUpdater(None) + mock_os_open = mock.mock_open(read_data=mock_tc_config) + with mock.patch('sonic_platform.utils.open', mock_os_open): + updater.load_tc_config() + assert updater._timer._timestamp_queue.qsize() == 2 + + @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_asic', mock.MagicMock()) + @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_module', mock.MagicMock()) + 
@mock.patch('sonic_platform.thermal_updater.ThermalUpdater.wait_all_sfp_ready') + @mock.patch('sonic_platform.utils.write_file') + def test_start_stop(self, mock_write, mock_wait): + mock_wait.return_value = True + mock_sfp = mock.MagicMock() + mock_sfp.sdk_index = 1 + updater = ThermalUpdater([mock_sfp]) + updater.start() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0) + utils.wait_until(updater._timer.is_alive, timeout=5) + + mock_write.reset_mock() + updater.stop() + assert not updater._timer.is_alive() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) + + mock_wait.return_value = False + mock_write.reset_mock() + updater.start() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) + updater.stop() + + @mock.patch('sonic_platform.thermal_updater.time.sleep', mock.MagicMock()) + def test_wait_all_sfp_ready(self): + mock_sfp = mock.MagicMock() + mock_sfp.is_sw_control = mock.MagicMock(return_value=True) + updater = ThermalUpdater([mock_sfp]) + assert updater.wait_all_sfp_ready() + mock_sfp.is_sw_control.side_effect = Exception('') + assert not updater.wait_all_sfp_ready() + + @mock.patch('sonic_platform.utils.read_int_from_file') + def test_update_asic(self, mock_read): + mock_read.return_value = 8 + updater = ThermalUpdater(None) + assert updater.get_asic_temp() == 1000 + assert updater.get_asic_temp_warning_threashold() == 1000 + assert updater.get_asic_temp_critical_threashold() == 1000 + updater.update_asic() + hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once() + + mock_read.return_value = None + assert updater.get_asic_temp() is None + assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD + assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD + + def test_update_module(self): + mock_sfp = mock.MagicMock() + mock_sfp.sdk_index = 10 + mock_sfp.get_presence = 
mock.MagicMock(return_value=True) + mock_sfp.get_temperature = mock.MagicMock(return_value=55.0) + mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0) + mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0) + updater = ThermalUpdater([mock_sfp]) + updater.update_module() + hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0) + + mock_sfp.get_temperature = mock.MagicMock(return_value=0.0) + hw_management_independent_mode_update.reset_mock() + updater.update_module() + hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0) + + mock_sfp.get_presence = mock.MagicMock(return_value=False) + updater.update_module() + hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_utils.py b/platform/mellanox/mlnx-platform-api/tests/test_utils.py index 04b00f82f4..2a186de7e5 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_utils.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_utils.py @@ -191,6 +191,26 @@ class TestUtils: mock_os_open = mock.mock_open(read_data='a:b') with mock.patch('sonic_platform.utils.open', mock_os_open): assert utils.read_key_value_file('some_file') == {'a':'b'} + mock_os_open = mock.mock_open(read_data='a=b') with mock.patch('sonic_platform.utils.open', mock_os_open): assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'} + + def test_timer(self): + timer = utils.Timer() + timer.start() + mock_cb_1000_run_now = mock.MagicMock() + mock_cb_1000_run_future = mock.MagicMock() + mock_cb_1_run_future_once = mock.MagicMock() + mock_cb_1_run_future_repeat = mock.MagicMock() + timer.schedule(1000, cb=mock_cb_1000_run_now, repeat=False, run_now=True) + timer.schedule(1000, cb=mock_cb_1000_run_future, repeat=False, run_now=False) + timer.schedule(1, 
cb=mock_cb_1_run_future_once, repeat=False, run_now=False) + timer.schedule(1, cb=mock_cb_1_run_future_repeat, repeat=True, run_now=False) + time.sleep(3) + timer.stop() + + mock_cb_1000_run_now.assert_called_once() + mock_cb_1000_run_future.assert_not_called() + mock_cb_1_run_future_once.assert_called_once() + assert mock_cb_1_run_future_repeat.call_count > 1