[Mellanox] update asic and module temperature in a thread for CMIS management (#16955)
- Why I did it: When a module is fully under software control, the driver cannot get the module temperature or temperature thresholds from firmware. In this case, SONiC needs to read the temperature and temperature thresholds from the module EEPROM. This PR adds a thermal updater thread that updates the module temperature and temperature thresholds while software control is enabled. - How I did it: Query the ASIC temperature from SDK sysfs and update hw-management-tc periodically; query the module temperature from EEPROM and update hw-management-tc periodically. - How to verify it: Manual test; new unit tests.
This commit is contained in:
parent
0d62cf0e92
commit
1b84f3daa5
@ -82,6 +82,8 @@ class Chassis(ChassisBase):
|
||||
# System UID LED
|
||||
_led_uid = None
|
||||
|
||||
chassis_instance = None
|
||||
|
||||
def __init__(self):
|
||||
super(Chassis, self).__init__()
|
||||
|
||||
@ -127,6 +129,8 @@ class Chassis(ChassisBase):
|
||||
self._RJ45_port_inited = False
|
||||
self._RJ45_port_list = None
|
||||
|
||||
Chassis.chassis_instance = self
|
||||
|
||||
self.modules_mgmt_thread = threading.Thread()
|
||||
self.modules_changes_queue = queue.Queue()
|
||||
self.modules_mgmt_task_stopping_event = threading.Event()
|
||||
|
@ -31,6 +31,8 @@ try:
|
||||
from . import utils
|
||||
from .device_data import DeviceDataManager
|
||||
from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
|
||||
from sonic_platform_base.sonic_xcvr.fields import consts
|
||||
from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436
|
||||
|
||||
except ImportError as e:
|
||||
raise ImportError (str(e) + "- required module not found")
|
||||
@ -155,6 +157,10 @@ SFP_TYPE_SFF8636 = 'sff8636'
|
||||
# SFP stderr
|
||||
SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'
|
||||
|
||||
SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
|
||||
SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
|
||||
SFP_TEMPERATURE_SCALE = 8.0
|
||||
|
||||
# SFP EEPROM limited bytes
|
||||
limited_eeprom = {
|
||||
SFP_TYPE_CMIS: {
|
||||
@ -264,7 +270,7 @@ class SFP(NvidiaSFPCommon):
|
||||
|
||||
if slot_id == 0: # For non-modular chassis
|
||||
from .thermal import initialize_sfp_thermal
|
||||
self._thermal_list = initialize_sfp_thermal(sfp_index)
|
||||
self._thermal_list = initialize_sfp_thermal(self)
|
||||
else: # For modular chassis
|
||||
# (slot_id % MAX_LC_CONUNT - 1) * MAX_PORT_COUNT + (sfp_index + 1) * (MAX_PORT_COUNT / LC_PORT_COUNT)
|
||||
max_linecard_count = DeviceDataManager.get_linecard_count()
|
||||
@ -822,6 +828,77 @@ class SFP(NvidiaSFPCommon):
|
||||
api = self.get_xcvr_api()
|
||||
return [False] * api.NUM_CHANNELS if api else None
|
||||
|
||||
def get_temperature(self):
    """Get the module temperature.

    When the module is not under software control, the raw value is read
    from the SDK sysfs node and divided by SFP_TEMPERATURE_SCALE.
    Otherwise the SFP is re-initialized and the reading is delegated to
    the parent class implementation.

    Returns:
        The temperature as a float; None if the sysfs node does not exist
        or no value is available; 0.0 if querying the control mode or
        reading sysfs raised an exception.
    """
    try:
        if not self.is_sw_control():
            temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
            if not os.path.exists(temp_file):
                logger.log_error(f'Failed to read from file {temp_file} - not exists')
                return None
            temperature = utils.read_int_from_file(temp_file, log_func=None)
            return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else None
    except Exception:
        # Best effort: callers treat 0.0 as "no valid reading". Only
        # ordinary errors are swallowed so that KeyboardInterrupt and
        # SystemExit still propagate.
        return 0.0

    self.reinit()
    return super().get_temperature()
|
||||
|
||||
def get_temperature_warning_threashold(self):
    """Get temperature warning threshold

    When the module is not under software control, the threshold is read
    from the SDK sysfs "emergency" node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise it is taken from the thresholds
    returned by _get_temperature_threshold().

    Returns:
        float: temperature warning threshold, or
        SFP_DEFAULT_TEMP_WARNNING_THRESHOLD when no value is available
    """
    try:
        if not self.is_sw_control():
            emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
                                                 log_func=None,
                                                 default=None)
            return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
    except Exception:
        # Fall back to the default on ordinary errors only; do not swallow
        # KeyboardInterrupt/SystemExit.
        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_WARNING_FIELD]
    return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
|
||||
|
||||
def get_temperature_critical_threashold(self):
    """Get temperature critical threshold

    When the module is not under software control, the threshold is read
    from the SDK sysfs "critical" node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise it is taken from the thresholds
    returned by _get_temperature_threshold().

    Returns:
        float: temperature critical threshold, or
        SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD when no value is available
    """
    try:
        if not self.is_sw_control():
            critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
                                                log_func=None,
                                                default=None)
            return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
    except Exception:
        # Fall back to the default on ordinary errors only; do not swallow
        # KeyboardInterrupt/SystemExit.
        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_ALARM_FIELD]
    return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
|
||||
def _get_temperature_threshold(self):
    """Read the threshold data of this module through its xcvr API.

    Returns:
        Whatever the API's EEPROM reader returns for the thresholds field
        (TEMP_THRESHOLDS_FIELD for SFF-8636/SFF-8436 APIs, THRESHOLDS_FIELD
        for any other API), or None when there is no API or the API
        reports that thresholds are not supported.
    """
    self.reinit()
    xcvr_api = self.get_xcvr_api()
    if not xcvr_api:
        return None

    if not xcvr_api.get_transceiver_thresholds_support():
        return None

    if isinstance(xcvr_api, (sff8636.Sff8636Api, sff8436.Sff8436Api)):
        return xcvr_api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
    return xcvr_api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
|
||||
|
||||
def get_xcvr_api(self):
|
||||
"""
|
||||
Retrieves the XcvrApi associated with this SFP
|
||||
|
@ -36,6 +36,8 @@ except ImportError as e:
|
||||
# Global logger class instance
|
||||
logger = Logger()
|
||||
|
||||
DEFAULT_TEMP_SCALE = 1000
|
||||
|
||||
"""
|
||||
The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and
|
||||
high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types
|
||||
@ -72,9 +74,11 @@ THERMAL_NAMING_RULE = {
|
||||
"chassis thermals": [
|
||||
{
|
||||
"name": "ASIC",
|
||||
"temperature": "asic",
|
||||
"high_threshold": "asic_temp_emergency",
|
||||
"high_critical_threshold": "asic_temp_trip_crit"
|
||||
"temperature": "input",
|
||||
"high_threshold_default": 105,
|
||||
"high_critical_threshold_default": 120,
|
||||
"sysfs_folder": "/sys/module/sx_core/asic0/temperature",
|
||||
"scale": 8
|
||||
},
|
||||
{
|
||||
"name": "Ambient Port Side Temp",
|
||||
@ -187,8 +191,8 @@ def initialize_psu_thermal(psu_index, presence_cb):
|
||||
return [create_indexable_thermal(THERMAL_NAMING_RULE['psu thermals'], psu_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1, presence_cb)]
|
||||
|
||||
|
||||
def initialize_sfp_thermal(sfp_index):
|
||||
return [create_indexable_thermal(THERMAL_NAMING_RULE['sfp thermals'], sfp_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1)]
|
||||
def initialize_sfp_thermal(sfp):
|
||||
return [ModuleThermal(sfp)]
|
||||
|
||||
|
||||
def initialize_linecard_thermals(lc_name, lc_index):
|
||||
@ -214,6 +218,7 @@ def initialize_linecard_sfp_thermal(lc_name, lc_index, sfp_index):
|
||||
def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=None):
|
||||
index += rule.get('start_index', 1)
|
||||
name = rule['name'].format(index)
|
||||
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
|
||||
temp_file = os.path.join(sysfs_folder, rule['temperature'].format(index))
|
||||
_check_thermal_sysfs_existence(temp_file)
|
||||
if 'high_threshold' in rule:
|
||||
@ -226,10 +231,13 @@ def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=No
|
||||
_check_thermal_sysfs_existence(high_crit_th_file)
|
||||
else:
|
||||
high_crit_th_file = None
|
||||
high_th_default = rule.get('high_threshold_default')
|
||||
high_crit_th_default = rule.get('high_critical_threshold_default')
|
||||
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
|
||||
if not presence_cb:
|
||||
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position)
|
||||
return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
|
||||
else:
|
||||
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb)
|
||||
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)
|
||||
|
||||
|
||||
def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
|
||||
@ -243,6 +251,7 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
|
||||
elif not default_present:
|
||||
return None
|
||||
|
||||
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
|
||||
temp_file = os.path.join(sysfs_folder, temp_file)
|
||||
_check_thermal_sysfs_existence(temp_file)
|
||||
if 'high_threshold' in rule:
|
||||
@ -255,11 +264,14 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
|
||||
_check_thermal_sysfs_existence(high_crit_th_file)
|
||||
else:
|
||||
high_crit_th_file = None
|
||||
high_th_default = rule.get('high_threshold_default')
|
||||
high_crit_th_default = rule.get('high_critical_threshold_default')
|
||||
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
|
||||
name = rule['name']
|
||||
if not presence_cb:
|
||||
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position)
|
||||
return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
|
||||
else:
|
||||
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb)
|
||||
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)
|
||||
|
||||
|
||||
def _check_thermal_sysfs_existence(file_path):
|
||||
@ -268,7 +280,7 @@ def _check_thermal_sysfs_existence(file_path):
|
||||
|
||||
|
||||
class Thermal(ThermalBase):
|
||||
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position):
|
||||
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position):
|
||||
"""
|
||||
index should be a string for category ambient and int for other categories
|
||||
"""
|
||||
@ -278,6 +290,9 @@ class Thermal(ThermalBase):
|
||||
self.temperature = temp_file
|
||||
self.high_threshold = high_th_file
|
||||
self.high_critical_threshold = high_crit_th_file
|
||||
self.high_th_default = high_th_default
|
||||
self.high_crit_th_default = high_crit_th_default
|
||||
self.scale = scale
|
||||
|
||||
def get_name(self):
|
||||
"""
|
||||
@ -297,7 +312,7 @@ class Thermal(ThermalBase):
|
||||
of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info)
|
||||
return value / 1000.0 if (value is not None and value != 0) else None
|
||||
return value / self.scale if (value is not None and value != 0) else None
|
||||
|
||||
def get_high_threshold(self):
|
||||
"""
|
||||
@ -308,9 +323,9 @@ class Thermal(ThermalBase):
|
||||
up to nearest thousandth of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
if self.high_threshold is None:
|
||||
return None
|
||||
return self.high_th_default
|
||||
value = utils.read_float_from_file(self.high_threshold, None, log_func=logger.log_info)
|
||||
return value / 1000.0 if (value is not None and value != 0) else None
|
||||
return value / self.scale if (value is not None and value != 0) else self.high_th_default
|
||||
|
||||
def get_high_critical_threshold(self):
|
||||
"""
|
||||
@ -321,9 +336,9 @@ class Thermal(ThermalBase):
|
||||
up to nearest thousandth of one degree Celsius, e.g. 30.125
|
||||
"""
|
||||
if self.high_critical_threshold is None:
|
||||
return None
|
||||
return self.high_crit_th_default
|
||||
value = utils.read_float_from_file(self.high_critical_threshold, None, log_func=logger.log_info)
|
||||
return value / 1000.0 if (value is not None and value != 0) else None
|
||||
return value / self.scale if (value is not None and value != 0) else self.high_crit_th_default
|
||||
|
||||
def get_position_in_parent(self):
|
||||
"""
|
||||
@ -343,8 +358,8 @@ class Thermal(ThermalBase):
|
||||
|
||||
|
||||
class RemovableThermal(Thermal):
|
||||
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb):
|
||||
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position)
|
||||
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb):
|
||||
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
|
||||
self.presence_cb = presence_cb
|
||||
|
||||
def get_temperature(self):
|
||||
@ -388,3 +403,68 @@ class RemovableThermal(Thermal):
|
||||
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
|
||||
return None
|
||||
return super(RemovableThermal, self).get_high_critical_threshold()
|
||||
|
||||
|
||||
class ModuleThermal(ThermalBase):
    """Thermal object whose readings are delegated to an SFP object."""

    def __init__(self, sfp):
        """
        Create the thermal object for one module.

        Args:
            sfp: the SFP object backing this thermal. Its sdk_index is used
                 to build the (1-based) name, and its temperature getters
                 are called for every reading.
        """
        super(ModuleThermal, self).__init__()
        self.name = f'xSFP module {sfp.sdk_index + 1} Temp'
        self.sfp = sfp

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name of the device
        """
        return self.name

    def get_temperature(self):
        """
        Retrieves current temperature reading from thermal

        Returns:
            A float number of current temperature in Celsius up to nearest thousandth
            of one degree Celsius, e.g. 30.125
        """
        return self.sfp.get_temperature()

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature of thermal

        Returns:
            A float number, the high threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
        """
        return self.sfp.get_temperature_warning_threashold()

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature of thermal

        Returns:
            A float number, the high critical threshold temperature of thermal in Celsius
            up to nearest thousandth of one degree Celsius, e.g. 30.125
        """
        return self.sfp.get_temperature_critical_threashold()

    def get_position_in_parent(self):
        """
        Retrieves 1-based relative physical position in parent device

        Returns:
            integer: The 1-based relative physical position in parent device
        """
        return 1

    def is_replaceable(self):
        """
        Indicate whether this device is replaceable.

        Returns:
            bool: True if it is replaceable.
        """
        return False
|
||||
|
@ -15,9 +15,36 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
|
||||
from . import thermal_updater
|
||||
from .device_data import DeviceDataManager
|
||||
|
||||
|
||||
class ThermalManager(ThermalManagerBase):
    # ThermalUpdater created by initialize() in independent mode; stays None
    # until then.
    thermal_updater_task = None

    @classmethod
    def run_policy(cls, chassis):
        pass

    @classmethod
    def initialize(cls):
        """
        Initialize thermal manager, including register thermal condition types and thermal action types
        and any other vendor specific initialization.

        In independent mode this creates a ThermalUpdater for all SFPs of
        the chassis instance and starts it.
        :return:
        """
        if DeviceDataManager.is_independent_mode():
            # Imported here rather than at module level, as in the original.
            from .chassis import Chassis
            cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
            cls.thermal_updater_task.start()

    @classmethod
    def deinitialize(cls):
        """
        Destroy thermal manager, including any vendor specific cleanup.

        In independent mode this stops the thermal updater, if one was
        created by initialize().
        :return:
        """
        # Guard against deinitialize() being called when initialize() never
        # ran or failed before the updater was assigned.
        if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
            cls.thermal_updater_task.stop()
|
||||
|
@ -0,0 +1,213 @@
|
||||
#
|
||||
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
|
||||
# Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from . import utils
|
||||
from sonic_py_common import logger
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.append('/run/hw-management/bin')
|
||||
|
||||
try:
|
||||
import hw_management_independent_mode_update
|
||||
except ImportError:
|
||||
# For unit test only
|
||||
from unittest import mock
|
||||
hw_management_independent_mode_update = mock.MagicMock()
|
||||
hw_management_independent_mode_update.module_data_set_module_counter = mock.MagicMock()
|
||||
hw_management_independent_mode_update.thermal_data_set_asic = mock.MagicMock()
|
||||
hw_management_independent_mode_update.thermal_data_set_module = mock.MagicMock()
|
||||
hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock()
|
||||
hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock()
|
||||
|
||||
|
||||
SFP_TEMPERATURE_SCALE = 1000
|
||||
ASIC_TEMPERATURE_SCALE = 125
|
||||
ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000
|
||||
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000
|
||||
|
||||
ERROR_READ_THERMAL_DATA = 254000
|
||||
|
||||
TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json'
|
||||
logger = logger.Logger('thermal-updater')
|
||||
|
||||
|
||||
class ThermalUpdater:
    """Periodically push ASIC and module thermal data to hw-management-tc.

    Data is published through the hw_management_independent_mode_update
    module; polling is driven by a utils.Timer.
    """

    def __init__(self, sfp_list):
        """
        Args:
            sfp_list: SFP objects whose thermal data should be published.
        """
        self._sfp_list = sfp_list
        # Last known presence per module, keyed by sdk_index
        self._sfp_status = {}
        self._timer = utils.Timer()

    def load_tc_config(self):
        """Read polling intervals from TC_CONFIG_FILE and schedule the updates.

        Defaults are 1 (ASIC) and 10 (module). When a 'poll_time' is
        configured, half of that value is used as the interval.
        """
        asic_poll_interval = 1
        sfp_poll_interval = 10
        data = utils.load_json_file(TC_CONFIG_FILE)
        if not data:
            logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
        else:
            dev_parameters = data.get('dev_parameters')
            if dev_parameters is not None:
                asic_parameter = dev_parameters.get('asic')
                if asic_parameter is not None:
                    asic_poll_interval_config = asic_parameter.get('poll_time')
                    if asic_poll_interval_config:
                        asic_poll_interval = int(asic_poll_interval_config) / 2
                # The key is the literal text "module\d+" in the config file
                module_parameter = dev_parameters.get('module\\d+')
                if module_parameter is not None:
                    sfp_poll_interval_config = module_parameter.get('poll_time')
                    if sfp_poll_interval_config:
                        sfp_poll_interval = int(sfp_poll_interval_config) / 2

        logger.log_notice(f'ASIC polling interval: {asic_poll_interval}')
        self._timer.schedule(asic_poll_interval, self.update_asic)
        logger.log_notice(f'Module polling interval: {sfp_poll_interval}')
        self._timer.schedule(sfp_poll_interval, self.update_module)

    def start(self):
        """Clean stale data, wait for the SFPs and start periodic updates.

        If the SFPs do not become ready in time, hw-management-tc is
        suspended and no timer is started.
        """
        self.clean_thermal_data()
        if not self.wait_all_sfp_ready():
            logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend')
            self.control_tc(True)
            return
        self.control_tc(False)
        self.load_tc_config()
        self._timer.start()

    def stop(self):
        """Stop periodic updates and suspend hw-management-tc."""
        self._timer.stop()
        self.control_tc(True)

    def control_tc(self, suspend):
        """Write 1 (suspend) or 0 (resume) to the hw-management suspend file."""
        logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}')
        utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0)

    def clean_thermal_data(self):
        """Publish the module count and clean the ASIC and all module data."""
        hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list))
        hw_management_independent_mode_update.thermal_data_clean_asic(0)
        for sfp in self._sfp_list:
            hw_management_independent_mode_update.thermal_data_clean_module(
                0,
                sfp.sdk_index + 1
            )

    def wait_all_sfp_ready(self):
        """Wait until is_sw_control() succeeds once for every SFP.

        Makes at most 60 attempts, one second apart.

        Returns:
            bool: True when all SFPs are ready, False on timeout.
        """
        logger.log_notice('Waiting for all SFP modules ready...')
        max_wait_time = 60
        ready_set = set()
        while len(ready_set) != len(self._sfp_list):
            for sfp in self._sfp_list:
                if sfp in ready_set:
                    continue
                try:
                    sfp.is_sw_control()
                except Exception:
                    # Not ready yet; retry on the next round
                    continue
                ready_set.add(sfp)

            # Re-check right away so a fully ready system neither sleeps
            # needlessly nor is reported as failed on the last attempt.
            if len(ready_set) == len(self._sfp_list):
                break
            max_wait_time -= 1
            if max_wait_time == 0:
                return False
            time.sleep(1)

        logger.log_notice('All SFP modules are ready')
        return True

    def get_asic_temp(self):
        """Return the raw sysfs ASIC temperature times ASIC_TEMPERATURE_SCALE, or None."""
        temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
        return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None

    def get_asic_temp_warning_threashold(self):
        """Return the scaled sysfs 'emergency' value, or the default warning threshold."""
        emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
        return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD

    def get_asic_temp_critical_threashold(self):
        """Return the scaled sysfs 'critical' value, or the default critical threshold."""
        critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
        return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def update_single_module(self, sfp):
        """Publish the thermal data of one module.

        A present module gets its temperature and thresholds published
        (multiplied by SFP_TEMPERATURE_SCALE); a module that just became
        absent gets its data cleaned. Any exception results in an
        all-zero record with the ERROR_READ_THERMAL_DATA fault.
        """
        try:
            presence = sfp.get_presence()
            pre_presence = self._sfp_status.get(sfp.sdk_index)
            if presence:
                temperature = sfp.get_temperature()
                if temperature == 0:
                    # A zero temperature is published with zero thresholds
                    # and no fault.
                    warning_thresh = 0
                    critical_thresh = 0
                    fault = 0
                else:
                    warning_thresh = sfp.get_temperature_warning_threashold()
                    critical_thresh = sfp.get_temperature_critical_threashold()
                    fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
                    temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
                    warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
                    critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)

                hw_management_independent_mode_update.thermal_data_set_module(
                    0,  # ASIC index always 0 for now
                    sfp.sdk_index + 1,
                    temperature,
                    critical_thresh,
                    warning_thresh,
                    fault
                )
            else:
                if pre_presence != presence:
                    hw_management_independent_mode_update.thermal_data_clean_module(0, sfp.sdk_index + 1)

            if pre_presence != presence:
                self._sfp_status[sfp.sdk_index] = presence
        except Exception as e:
            # f-string so the module index and error are actually logged
            logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_module(
                0,  # ASIC index always 0 for now
                sfp.sdk_index + 1,
                0,
                0,
                0,
                ERROR_READ_THERMAL_DATA
            )

    def update_module(self):
        """Publish the thermal data of every module in the SFP list."""
        for sfp in self._sfp_list:
            self.update_single_module(sfp)

    def update_asic(self):
        """Publish the ASIC temperature and thresholds.

        When the temperature cannot be read, the warning threshold is
        published in its place together with the ERROR_READ_THERMAL_DATA
        fault. Any exception results in an all-zero record with that fault.
        """
        try:
            asic_temp = self.get_asic_temp()
            warn_threshold = self.get_asic_temp_warning_threashold()
            critical_threshold = self.get_asic_temp_critical_threashold()
            fault = 0
            if asic_temp is None:
                logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
                asic_temp = warn_threshold
                fault = ERROR_READ_THERMAL_DATA

            hw_management_independent_mode_update.thermal_data_set_asic(
                0,  # ASIC index always 0 for now
                asic_temp,
                critical_threshold,
                warn_threshold,
                fault
            )
        except Exception as e:
            # f-string so the error is actually logged
            logger.log_error(f'Failed to update ASIC thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_asic(
                0,  # ASIC index always 0 for now
                0,
                0,
                0,
                ERROR_READ_THERMAL_DATA
            )
|
@ -18,6 +18,7 @@ import ctypes
|
||||
import functools
|
||||
import subprocess
|
||||
import json
|
||||
import queue
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
@ -289,6 +290,60 @@ def wait_until(predict, timeout, interval=1, *args, **kwargs):
|
||||
return False
|
||||
|
||||
|
||||
class TimerEvent:
    """A callback together with its scheduling interval and repeat flag."""

    def __init__(self, interval, cb, repeat):
        """
        Args:
            interval: delay between executions, added to time.time() when
                      the event is (re)scheduled
            cb: callable invoked without arguments by execute()
            repeat: whether the event is re-queued after it has run
        """
        self.interval = interval
        self._cb = cb
        self.repeat = repeat

    def __lt__(self, other):
        """Order events by interval.

        Events are stored as (timestamp, event) tuples in a priority queue.
        When two timestamps are equal the tuple comparison falls through to
        the events themselves, which would raise TypeError without this.
        """
        if not isinstance(other, TimerEvent):
            return NotImplemented
        return self.interval < other.interval

    def execute(self):
        """Invoke the callback."""
        self._cb()
|
||||
|
||||
|
||||
class Timer(threading.Thread):
    """Thread that executes scheduled timer events when they become due.

    Events are kept in a priority queue as (due timestamp, event) tuples.
    """

    # Queue entry used only to wake run() out of a blocking get() on stop.
    # -inf sorts before every real timestamp, so its second element is never
    # compared with a timer event.
    _STOP_ITEM = (float('-inf'), 0)

    def __init__(self):
        super(Timer, self).__init__()
        self._timestamp_queue = queue.PriorityQueue()
        # Set to cut a wait short (earlier event added, or stop requested)
        self._wait_event = threading.Event()
        self._stop_event = threading.Event()
        # Due time of the entry run() fetched most recently
        self._min_timestamp = None

    def schedule(self, interval, cb, repeat=True, run_now=True):
        """Schedule a callback.

        Args:
            interval: delay between executions
            cb: callable invoked without arguments
            repeat: re-queue the callback after each execution
            run_now: make the first execution due immediately instead of
                     after one interval
        """
        timer_event = TimerEvent(interval, cb, repeat)
        self.add_timer_event(timer_event, run_now)

    def add_timer_event(self, timer_event, run_now=True):
        """Queue a timer event, due now or after its interval."""
        timestamp = time.time()
        if not run_now:
            timestamp += timer_event.interval

        self._timestamp_queue.put_nowait((timestamp, timer_event))
        if self._min_timestamp is not None and timestamp < self._min_timestamp:
            # The new event is due before the one being waited for
            self._wait_event.set()

    def stop(self):
        """Stop the thread, if it is running, and wait for it to exit."""
        if self.is_alive():
            # Set the stop flag BEFORE waking the worker; otherwise it can
            # wake up, still see the flag unset and start another full wait.
            self._stop_event.set()
            self._wait_event.set()
            # Also unblock a worker sitting in get() on an empty queue
            self._timestamp_queue.put_nowait(self._STOP_ITEM)
            self.join()

    def run(self):
        """Execute events as they become due until stop() is called."""
        while not self._stop_event.is_set():
            item = self._timestamp_queue.get()
            if self._stop_event.is_set():
                break

            # Sample the clock after get(), which may have blocked
            now = time.time()
            self._min_timestamp = item[0]
            if self._min_timestamp > now:
                # Not due yet: wait, then put it back and re-evaluate, since
                # an earlier event may have been added meanwhile
                self._wait_event.wait(self._min_timestamp - now)
                self._wait_event.clear()
                self._timestamp_queue.put(item)
                continue

            timer_event = item[1]
            timer_event.execute()
            if timer_event.repeat:
                self.add_timer_event(timer_event, False)
|
||||
|
||||
|
||||
class DbUtils:
|
||||
lock = threading.Lock()
|
||||
db_instances = threading.local()
|
||||
|
@ -292,6 +292,46 @@ class TestSfp:
|
||||
assert sfp.get_transceiver_threshold_info()
|
||||
sfp.reinit()
|
||||
|
||||
@mock.patch('os.path.exists')
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_get_temperature(self, mock_read, mock_exists):
    """Check SFP.get_temperature() with and without software control."""
    sfp = SFP(0)

    # Under software control the sysfs path is not used
    sfp.is_sw_control = mock.MagicMock(return_value=True)
    mock_exists.return_value = False
    assert sfp.get_temperature() is None

    mock_exists.return_value = True
    assert sfp.get_temperature() is None

    # Not under software control: the value comes from read_int_from_file
    mock_read.return_value = None
    sfp.is_sw_control.return_value = False
    assert sfp.get_temperature() is None

    # The raw value is divided by 8: 448 / 8 == 56.0
    mock_read.return_value = 448
    assert sfp.get_temperature() == 56.0
|
||||
|
||||
def test_get_temperature_threshold(self):
    """Check the warning/critical threshold getters under software control."""
    from sonic_platform_base.sonic_xcvr.fields import consts

    def read_thresholds(module):
        warning = module.get_temperature_warning_threashold()
        critical = module.get_temperature_critical_threashold()
        return warning, critical

    sfp = SFP(0)
    sfp.is_sw_control = mock.MagicMock(return_value=True)
    # Before any xcvr API is mocked, the defaults are expected
    assert read_thresholds(sfp) == (70.0, 80.0)

    # API present but thresholds not supported: still the defaults
    xcvr_api = mock.MagicMock()
    xcvr_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
    sfp.get_xcvr_api = mock.MagicMock(return_value=xcvr_api)
    assert read_thresholds(sfp) == (70.0, 80.0)

    # Thresholds supported: values come from the EEPROM read
    eeprom_thresholds = {
        consts.TEMP_HIGH_ALARM_FIELD: 85.0,
        consts.TEMP_HIGH_WARNING_FIELD: 75.0
    }
    xcvr_api.get_transceiver_thresholds_support.return_value = True
    xcvr_api.xcvr_eeprom = mock.MagicMock()
    xcvr_api.xcvr_eeprom.read = mock.MagicMock(return_value=eeprom_thresholds)
    assert read_thresholds(sfp) == (75.0, 85.0)
|
||||
|
||||
@mock.patch('sonic_platform.utils.read_int_from_file')
|
||||
@mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode')
|
||||
@mock.patch('sonic_platform.utils.DbUtils.get_db_instance')
|
||||
|
@ -31,6 +31,7 @@ sys.path.insert(0, modules_path)
|
||||
import sonic_platform.chassis
|
||||
from sonic_platform.chassis import Chassis
|
||||
from sonic_platform.device_data import DeviceDataManager
|
||||
from sonic_platform.sfp import SFP
|
||||
|
||||
sonic_platform.chassis.extract_RJ45_ports_index = mock.MagicMock(return_value=[])
|
||||
|
||||
@ -148,23 +149,27 @@ class TestThermal:
|
||||
|
||||
@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
|
||||
def test_sfp_thermal(self):
|
||||
from sonic_platform.thermal import initialize_sfp_thermal, THERMAL_NAMING_RULE
|
||||
thermal_list = initialize_sfp_thermal(0)
|
||||
from sonic_platform.thermal import THERMAL_NAMING_RULE
|
||||
sfp = SFP(0)
|
||||
thermal_list = sfp.get_all_thermals()
|
||||
assert len(thermal_list) == 1
|
||||
thermal = thermal_list[0]
|
||||
rule = THERMAL_NAMING_RULE['sfp thermals']
|
||||
start_index = rule.get('start_index', 1)
|
||||
assert thermal.get_name() == rule['name'].format(start_index)
|
||||
assert rule['temperature'].format(start_index) in thermal.temperature
|
||||
assert rule['high_threshold'].format(start_index) in thermal.high_threshold
|
||||
assert rule['high_critical_threshold'].format(start_index) in thermal.high_critical_threshold
|
||||
assert thermal.get_position_in_parent() == 1
|
||||
assert thermal.is_replaceable() == False
|
||||
sfp.get_temperature = mock.MagicMock(return_value=35.4)
|
||||
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
|
||||
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
|
||||
assert thermal.get_temperature() == 35.4
|
||||
assert thermal.get_high_threshold() == 70
|
||||
assert thermal.get_high_critical_threshold() == 80
|
||||
|
||||
@mock.patch('sonic_platform.utils.read_float_from_file')
|
||||
def test_get_temperature(self, mock_read):
|
||||
from sonic_platform.thermal import Thermal
|
||||
thermal = Thermal('test', 'temp_file', None, None, 1)
|
||||
thermal = Thermal('test', 'temp_file', None, None, None, None, 1000, 1)
|
||||
mock_read.return_value = 35727
|
||||
assert thermal.get_temperature() == 35.727
|
||||
|
||||
@ -177,7 +182,7 @@ class TestThermal:
|
||||
@mock.patch('sonic_platform.utils.read_float_from_file')
|
||||
def test_get_high_threshold(self, mock_read):
|
||||
from sonic_platform.thermal import Thermal
|
||||
thermal = Thermal('test', None, None, None, 1)
|
||||
thermal = Thermal('test', None, None, None, None, None, 1000, 1)
|
||||
assert thermal.get_high_threshold() is None
|
||||
|
||||
thermal.high_threshold = 'high_th_file'
|
||||
@ -193,7 +198,7 @@ class TestThermal:
|
||||
@mock.patch('sonic_platform.utils.read_float_from_file')
|
||||
def test_get_high_critical_threshold(self, mock_read):
|
||||
from sonic_platform.thermal import Thermal
|
||||
thermal = Thermal('test', None, None, None, 1)
|
||||
thermal = Thermal('test', None, None, None, None, None, 1000, 1)
|
||||
assert thermal.get_high_critical_threshold() is None
|
||||
|
||||
thermal.high_critical_threshold = 'high_th_file'
|
||||
|
@ -0,0 +1,128 @@
|
||||
#
|
||||
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
|
||||
# Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
from sonic_platform import utils
|
||||
from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update
|
||||
from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \
|
||||
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
|
||||
|
||||
|
||||
# Thermal-control configuration text used as fake file content: test_load_tc_config_mocked
# passes it to mock.mock_open(read_data=...) so ThermalUpdater.load_tc_config() reads it
# instead of a real file. It holds two entries under "dev_parameters" (one for the ASIC, one
# for modules), each with its own "poll_time".
mock_tc_config = """
|
||||
{
|
||||
"dev_parameters": {
|
||||
"asic": {
|
||||
"pwm_min": 20,
|
||||
"pwm_max": 100,
|
||||
"val_min": "!70000",
|
||||
"val_max": "!105000",
|
||||
"poll_time": 3
|
||||
},
|
||||
"module\\\\d+": {
|
||||
"pwm_min": 20,
|
||||
"pwm_max": 100,
|
||||
"val_min": 60000,
|
||||
"val_max": 80000,
|
||||
"poll_time": 20
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
class TestThermalUpdater:
    """Unit tests for ``sonic_platform.thermal_updater.ThermalUpdater``.

    ``hw_management_independent_mode_update`` is a mock object shared at module level
    (these tests call ``reset_mock()`` and ``assert_called_once*`` on it), so every test
    that asserts on its call history clears it first to stay independent of test order.
    """

    def test_load_tc_config_non_exists(self):
        """``load_tc_config`` without any file mocking leaves two entries queued on the timer.

        NOTE(review): per the test name the config file is presumably absent in the test
        environment, so this exercises the fallback path -- confirm against load_tc_config.
        """
        updater = ThermalUpdater(None)
        updater.load_tc_config()
        assert updater._timer._timestamp_queue.qsize() == 2

    def test_load_tc_config_mocked(self):
        """``load_tc_config`` fed with ``mock_tc_config`` also leaves two entries queued."""
        updater = ThermalUpdater(None)
        mock_os_open = mock.mock_open(read_data=mock_tc_config)
        # Only the file read needs the patched open(); the queue is inspected afterwards.
        with mock.patch('sonic_platform.utils.open', mock_os_open):
            updater.load_tc_config()
        assert updater._timer._timestamp_queue.qsize() == 2

    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_asic', mock.MagicMock())
    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_module', mock.MagicMock())
    @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.wait_all_sfp_ready')
    @mock.patch('sonic_platform.utils.write_file')
    def test_start_stop(self, mock_write, mock_wait):
        """Check the value written to the suspend file on start/stop and the timer state.

        Args:
            mock_write: patched ``sonic_platform.utils.write_file``.
            mock_wait: patched ``ThermalUpdater.wait_all_sfp_ready``.
        """
        suspend_file = '/run/hw-management/config/suspend'

        # SFPs reported ready: start() is expected to write 0 and run the timer.
        mock_wait.return_value = True
        mock_sfp = mock.MagicMock()
        mock_sfp.sdk_index = 1
        updater = ThermalUpdater([mock_sfp])
        updater.start()
        mock_write.assert_called_once_with(suspend_file, 0)
        utils.wait_until(updater._timer.is_alive, timeout=5)
        # Without this check the test would still pass if the timer never started,
        # because the "not alive" assertion after stop() would be trivially true.
        assert updater._timer.is_alive()

        # stop() is expected to end the timer and write 1.
        mock_write.reset_mock()
        updater.stop()
        assert not updater._timer.is_alive()
        mock_write.assert_called_once_with(suspend_file, 1)

        # SFPs never become ready: start() is expected to write 1.
        mock_wait.return_value = False
        mock_write.reset_mock()
        updater.start()
        mock_write.assert_called_once_with(suspend_file, 1)
        updater.stop()

    @mock.patch('sonic_platform.thermal_updater.time.sleep', mock.MagicMock())
    def test_wait_all_sfp_ready(self):
        """``wait_all_sfp_ready`` is truthy when ``is_sw_control`` succeeds, falsy when it raises.

        ``time.sleep`` inside ``thermal_updater`` is patched out so the test does not wait.
        """
        mock_sfp = mock.MagicMock()
        mock_sfp.is_sw_control = mock.MagicMock(return_value=True)
        updater = ThermalUpdater([mock_sfp])
        assert updater.wait_all_sfp_ready()

        # From now on every is_sw_control() call raises.
        mock_sfp.is_sw_control.side_effect = Exception('')
        assert not updater.wait_all_sfp_ready()

    @mock.patch('sonic_platform.utils.read_int_from_file')
    def test_update_asic(self, mock_read):
        """ASIC temperature/threshold getters and ``update_asic``.

        Args:
            mock_read: patched ``sonic_platform.utils.read_int_from_file``.
        """
        # Shared module-level mock: drop calls recorded by other tests before asserting.
        hw_management_independent_mode_update.reset_mock()

        # A raw reading of 8 is expected to be reported as 1000 by all three getters.
        mock_read.return_value = 8
        updater = ThermalUpdater(None)
        assert updater.get_asic_temp() == 1000
        assert updater.get_asic_temp_warning_threashold() == 1000
        assert updater.get_asic_temp_critical_threashold() == 1000
        updater.update_asic()
        hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()

        # No reading available: temperature is None, thresholds fall back to the defaults.
        mock_read.return_value = None
        assert updater.get_asic_temp() is None
        assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
        assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def test_update_module(self):
        """``update_module`` for a present module, a 0.0 reading, and an absent module."""
        # Shared module-level mock: drop calls recorded by other tests before asserting.
        hw_management_independent_mode_update.reset_mock()

        mock_sfp = mock.MagicMock()
        mock_sfp.sdk_index = 10
        mock_sfp.get_presence = mock.MagicMock(return_value=True)
        mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
        mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
        mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
        updater = ThermalUpdater([mock_sfp])
        updater.update_module()
        # Expected: sdk_index + 1 as the module id, values multiplied by 1000, and the
        # critical threshold (80000) passed before the warning threshold (70000).
        hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)

        # A temperature of 0.0 is expected to report zero for temperature and thresholds.
        mock_sfp.get_temperature = mock.MagicMock(return_value=0.0)
        hw_management_independent_mode_update.reset_mock()
        updater.update_module()
        hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0)

        # An absent module is expected to have its thermal data cleaned.
        mock_sfp.get_presence = mock.MagicMock(return_value=False)
        updater.update_module()
        hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11)
|
@ -191,6 +191,26 @@ class TestUtils:
|
||||
mock_os_open = mock.mock_open(read_data='a:b')
|
||||
with mock.patch('sonic_platform.utils.open', mock_os_open):
|
||||
assert utils.read_key_value_file('some_file') == {'a':'b'}
|
||||
|
||||
mock_os_open = mock.mock_open(read_data='a=b')
|
||||
with mock.patch('sonic_platform.utils.open', mock_os_open):
|
||||
assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'}
|
||||
|
||||
def test_timer(self):
    """Exercise ``utils.Timer`` with immediate, far-future, one-shot and repeating callbacks.

    Four callbacks are scheduled on a running timer, the timer is left alone for three
    seconds and then stopped; afterwards each callback's call count is checked.
    """
    immediate_cb = mock.MagicMock()
    far_future_cb = mock.MagicMock()
    one_shot_cb = mock.MagicMock()
    repeating_cb = mock.MagicMock()

    scheduler = utils.Timer()
    scheduler.start()

    # (interval, callback, repeat, run_now), registered in exactly this order.
    # NOTE(review): interval is presumably in seconds -- confirm against utils.Timer.
    plan = (
        (1000, immediate_cb, False, True),
        (1000, far_future_cb, False, False),
        (1, one_shot_cb, False, False),
        (1, repeating_cb, True, False),
    )
    for interval, callback, repeat, run_now in plan:
        scheduler.schedule(interval, cb=callback, repeat=repeat, run_now=run_now)

    # Give the short-interval callbacks time to fire (more than once for the repeating one).
    time.sleep(3)
    scheduler.stop()

    immediate_cb.assert_called_once()
    far_future_cb.assert_not_called()
    one_shot_cb.assert_called_once()
    assert repeating_cb.call_count > 1
|
||||
|
Loading…
Reference in New Issue
Block a user