[Mellanox] support new platform api, thermal and psu part (#3175)

* support new platform api, thermal and psu part
for psu, all APIs are supported.
for thermal, we support
  get_temperature,
  get_high_threshold
for the thermal sensors of cpu core, cpu pack, psu and sfp module
and get_temperature for the ambient thermal sensors around the asic, port, fan, comex and board.

* 1. address review comments
2. improve the handling of PSU inserting/removal
3. tolerance diverse psu thermal sensor file name conventions

* 1. adjust thermal code according to the latest version of hw-management
2. check power_good_status rather than whether file existing ahead of reading voltage, current and power of PSU
This commit is contained in:
Stephen Sun 2019-07-22 22:59:49 +08:00 committed by lguohan
parent 40c8bc14cd
commit 1d15022df7
3 changed files with 498 additions and 34 deletions

View File

@ -8,19 +8,19 @@
#
#############################################################################
import sys
try:
from sonic_platform_base.chassis_base import ChassisBase
from sonic_platform.psu import Psu
from sonic_platform.fan import Fan
from sonic_platform.fan import FAN_PATH
from sonic_platform.sfp import SFP
from sonic_platform.thermal import Thermal, initialize_thermals
from sonic_platform.watchdog import get_watchdog
from sonic_daemon_base.daemon_base import Logger
from eeprom import Eeprom
from os import listdir
from os.path import isfile, join
import sys
import io
import re
import subprocess
@ -64,12 +64,12 @@ COMPONENT_CPLD1 = "CPLD1"
COMPONENT_CPLD2 = "CPLD2"
# Global logger class instance
SYSLOG_IDENTIFIER = "mlnx-chassis"
SYSLOG_IDENTIFIER = "mlnx-chassis-api"
logger = Logger(SYSLOG_IDENTIFIER)
# magic code defnition for port number, qsfp port position of each hwsku
# port_position_tuple = (PORT_START, QSFP_PORT_START, PORT_END, PORT_IN_BLOCK, EEPROM_OFFSET)
hwsku_dict = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 0, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 3, 'ACS-MSN3700': 0, 'ACS-MSN3700C': 0, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0}
hwsku_dict_port = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 0, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 3, 'ACS-MSN3700': 0, 'ACS-MSN3700C': 0, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0}
port_position_tuple_list = [(0, 0, 31, 32, 1), (0, 0, 15, 16, 1), (0, 48, 55, 56, 1),(0, 18, 21, 22, 1)]
class Chassis(ChassisBase):
@ -78,9 +78,12 @@ class Chassis(ChassisBase):
def __init__(self):
super(Chassis, self).__init__()
# Initialize SKU name
self.sku_name = self._get_sku_name()
# Initialize PSU list
for index in range(MLNX_NUM_PSU):
psu = Psu(index)
psu = Psu(index, self.sku_name)
self._psu_list.append(psu)
# Initialize watchdog
@ -112,6 +115,9 @@ class Chassis(ChassisBase):
sfp_module = SFP(index, 'SFP')
self._sfp_list.append(sfp_module)
# Initialize thermals
initialize_thermals(self.sku_name, self._thermal_list, self._psu_list)
# Initialize EEPROM
self.eeprom = Eeprom()
@ -137,10 +143,13 @@ class Chassis(ChassisBase):
return num_of_fan, num_of_drawer
def _get_port_position_tuple_by_sku_name(self):
def _get_sku_name(self):
p = subprocess.Popen(GET_HWSKU_CMD, shell=True, stdout=subprocess.PIPE)
out, err = p.communicate()
position_tuple = port_position_tuple_list[hwsku_dict[out.rstrip('\n')]]
return out.rstrip('\n')
def _get_port_position_tuple_by_sku_name(self):
position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku_name]]
return position_tuple
def get_base_mac(self):
@ -183,8 +192,8 @@ class Chassis(ChassisBase):
result = fileobj.read(len)
fileobj.close()
return result
except:
logger.log_warning("Fail to read file {}, maybe it doesn't exist".format(filename))
except Exception as e:
logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
return ''
def _verify_reboot_cause(self, filename):

View File

@ -8,48 +8,113 @@
#
#############################################################################
import os.path
try:
import os.path
from sonic_platform_base.psu_base import PsuBase
from sonic_daemon_base.daemon_base import Logger
from sonic_platform.fan import Fan
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
# Global logger class instance
SYSLOG_IDENTIFIER = "mlnx-psu-api"
logger = Logger(SYSLOG_IDENTIFIER)
psu_list = []
PSU_CURRENT = "current"
PSU_VOLTAGE = "voltage"
PSU_POWER = "power"
# SKUs with unplugable PSUs:
# 1. don't have psuX_status and should be treated as always present
# 2. don't have voltage, current and power values
hwsku_dict_with_unplugable_psu = ['ACS-MSN2010', 'ACS-MSN2100']
# in most SKUs the file psuX_curr, psuX_volt and psuX_power contain current, voltage and power data respectively.
# but there are exceptions which will be handled by the following dictionary
hwsku_dict_psu = {'ACS-MSN3700': 1, 'ACS-MSN3700C': 1, 'ACS-MSN3800': 1}
psu_profile_list = [
# default filename convention
{
PSU_CURRENT : "power/psu{}_curr",
PSU_VOLTAGE : "power/psu{}_volt",
PSU_POWER : "power/psu{}_power"
},
# for 3700, 3700c, 3800
{
PSU_CURRENT : "power/psu{}_curr",
PSU_VOLTAGE : "power/psu{}_volt_out2",
PSU_POWER : "power/psu{}_power"
}
]
class Psu(PsuBase):
"""Platform-specific Psu class"""
def __init__(self, psu_index):
def __init__(self, psu_index, sku):
global psu_list
PsuBase.__init__(self)
# PSU is 1-based on Mellanox platform
self.index = psu_index + 1
psu_list.append(self.index)
self.psu_path = "/var/run/hw-management/thermal/"
self.psu_oper_status = "psu{}_pwr_status".format(self.index)
self.psu_presence = "psu{}_status".format(self.index)
if os.path.exists(os.path.join(self.psu_path, self.psu_presence)):
self.presence_file_exists = True
self.psu_path = "/var/run/hw-management/"
psu_oper_status = "thermal/psu{}_pwr_status".format(self.index)
#psu_oper_status should always be present for all SKUs
self.psu_oper_status = os.path.join(self.psu_path, psu_oper_status)
if sku in hwsku_dict_psu:
filemap = psu_profile_list[hwsku_dict_psu[sku]]
else:
self.presence_file_exists = False
filemap = psu_profile_list[0]
if sku in hwsku_dict_with_unplugable_psu:
self.always_presence = True
self.psu_voltage = None
self.psu_current = None
self.psu_power = None
self.psu_presence = None
else:
self.always_presence = False
psu_voltage = filemap[PSU_VOLTAGE].format(self.index)
psu_voltage = os.path.join(self.psu_path, psu_voltage)
self.psu_voltage = psu_voltage
psu_current = filemap[PSU_CURRENT].format(self.index)
psu_current = os.path.join(self.psu_path, psu_current)
self.psu_current = psu_current
psu_power = filemap[PSU_POWER].format(self.index)
psu_power = os.path.join(self.psu_path, psu_power)
self.psu_power = psu_power
psu_presence = "thermal/psu{}_status".format(self.index)
psu_presence = os.path.join(self.psu_path, psu_presence)
self.psu_presence = psu_presence
fan = Fan(psu_index, psu_index, True)
if fan.get_presence():
self._fan = fan
def get_status(self):
def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
"""
result = 0
try:
with open(filename, 'r') as fileobj:
result = int(fileobj.read())
except Exception as e:
logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
return result
def get_powergood_status(self):
"""
Retrieves the operational status of power supply unit (PSU) defined
Returns:
bool: True if PSU is operating properly, False if not
"""
status = 0
try:
with open(os.path.join(self.psu_path, self.psu_oper_status), 'r') as power_status:
status = int(power_status.read())
except (ValueError, IOError):
status = 0
status = self._read_generic_file(os.path.join(self.psu_path, self.psu_oper_status), 0)
return status == 1
@ -60,15 +125,48 @@ class Psu(PsuBase):
Returns:
bool: True if PSU is present, False if not
"""
status = 0
if self.presence_file_exists:
try:
with open(os.path.join(self.psu_path, self.psu_presence), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0
if self.always_presence:
return self.always_presence
else:
status = self.index in psu_list
status = self._read_generic_file(self.psu_presence, 0)
return status == 1
return status == 1
def get_voltage(self):
"""
Retrieves current PSU voltage output
Returns:
A float number, the output voltage in volts,
e.g. 12.1
"""
if self.psu_voltage is not None and self.get_powergood_status():
voltage = self._read_generic_file(self.psu_voltage, 0)
return float(voltage) / 1000
else:
return None
def get_current(self):
"""
Retrieves present electric current supplied by PSU
Returns:
A float number, the electric current in amperes, e.g 15.4
"""
if self.psu_current is not None and self.get_powergood_status():
amperes = self._read_generic_file(self.psu_current, 0)
return float(amperes) / 1000
else:
return None
def get_power(self):
"""
Retrieves current energy supplied by PSU
Returns:
A float number, the power in watts, e.g. 302.6
"""
if self.psu_power is not None and self.get_powergood_status():
power = self._read_generic_file(self.psu_power, 0)
return float(power) / 1000000
else:
return None

View File

@ -0,0 +1,357 @@
#!/usr/bin/env python
#############################################################################
# Mellanox
#
# Module contains an implementation of SONiC Platform Base API and
# provides the thermals data which are available in the platform
#
#############################################################################
try:
from sonic_platform_base.thermal_base import ThermalBase
from sonic_daemon_base.daemon_base import Logger
from os import listdir
from os.path import isfile, join
import io
import os.path
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
# Global logger class instance
SYSLOG_IDENTIFIER = "mlnx-thermal-api"
logger = Logger(SYSLOG_IDENTIFIER)
THERMAL_DEV_CATEGORY_CPU_CORE = "cpu_core"
THERMAL_DEV_CATEGORY_CPU_PACK = "cpu_pack"
THERMAL_DEV_CATEGORY_MODULE = "module"
THERMAL_DEV_CATEGORY_PSU = "psu"
THERMAL_DEV_CATEGORY_GEARBOX = "gearbox"
THERMAL_DEV_CATEGORY_AMBIENT = "ambient"
THERMAL_DEV_ASIC_AMBIENT = "asic_amb"
THERMAL_DEV_FAN_AMBIENT = "fan_amb"
THERMAL_DEV_PORT_AMBIENT = "port_amb"
THERMAL_DEV_COMEX_AMBIENT = "comex_amb"
THERMAL_DEV_BOARD_AMBIENT = "board_amb"
THERMAL_API_GET_TEMPERATURE = "get_temperature"
THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold"
HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/"
thermal_api_handler_cpu_core = {
THERMAL_API_GET_TEMPERATURE:"cpu_core{}",
THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max"
}
thermal_api_handler_cpu_pack = {
THERMAL_API_GET_TEMPERATURE:"cpu_pack",
THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max"
}
thermal_api_handler_module = {
THERMAL_API_GET_TEMPERATURE:"temp_input_module{}",
THERMAL_API_GET_HIGH_THRESHOLD:"temp_crit_module{}"
}
thermal_api_handler_psu = {
THERMAL_API_GET_TEMPERATURE:"psu{}_temp",
THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max"
}
thermal_api_handler_gearbox = {
THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}",
THERMAL_API_GET_HIGH_THRESHOLD:None
}
thermal_ambient_apis = {
THERMAL_DEV_ASIC_AMBIENT : "asic",
THERMAL_DEV_PORT_AMBIENT : "port_amb",
THERMAL_DEV_FAN_AMBIENT : "fan_amb",
THERMAL_DEV_COMEX_AMBIENT : "comex_amb",
THERMAL_DEV_BOARD_AMBIENT : "board_amb"
}
thermal_ambient_name = {
THERMAL_DEV_ASIC_AMBIENT : "Ambient ASIC Temp",
THERMAL_DEV_PORT_AMBIENT : "Ambient Port Side Temp",
THERMAL_DEV_FAN_AMBIENT : "Ambient Fan Side Temp",
THERMAL_DEV_COMEX_AMBIENT : "Ambient COMEX Temp",
THERMAL_DEV_BOARD_AMBIENT : "Ambient Board Temp"
}
thermal_api_handlers = {
THERMAL_DEV_CATEGORY_CPU_CORE : thermal_api_handler_cpu_core,
THERMAL_DEV_CATEGORY_CPU_PACK : thermal_api_handler_cpu_pack,
THERMAL_DEV_CATEGORY_MODULE : thermal_api_handler_module,
THERMAL_DEV_CATEGORY_PSU : thermal_api_handler_psu,
THERMAL_DEV_CATEGORY_GEARBOX : thermal_api_handler_gearbox
}
thermal_name = {
THERMAL_DEV_CATEGORY_CPU_CORE : "CPU Core {} Temp",
THERMAL_DEV_CATEGORY_CPU_PACK : "CPU Pack Temp",
THERMAL_DEV_CATEGORY_MODULE : "xSFP module {} Temp",
THERMAL_DEV_CATEGORY_PSU : "PSU-{} Temp",
THERMAL_DEV_CATEGORY_GEARBOX : "Gearbox {} Temp"
}
thermal_device_categories_all = [
THERMAL_DEV_CATEGORY_CPU_CORE,
THERMAL_DEV_CATEGORY_CPU_PACK,
THERMAL_DEV_CATEGORY_MODULE,
THERMAL_DEV_CATEGORY_PSU,
THERMAL_DEV_CATEGORY_AMBIENT,
THERMAL_DEV_CATEGORY_GEARBOX
]
thermal_device_categories_singleton = [
THERMAL_DEV_CATEGORY_CPU_PACK,
THERMAL_DEV_CATEGORY_AMBIENT
]
thermal_api_names = [
THERMAL_API_GET_TEMPERATURE,
THERMAL_API_GET_HIGH_THRESHOLD
]
hwsku_dict_thermal = {'ACS-MSN2700': 0, 'LS-SN2700':0, 'ACS-MSN2740': 3, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 4, 'ACS-MSN3700': 5, 'ACS-MSN3700C': 6, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0, 'ACS-MSN3800': 7}
thermal_profile_list = [
# 2700
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 2),
THERMAL_DEV_CATEGORY_MODULE:(1, 32),
THERMAL_DEV_CATEGORY_PSU:(1, 2),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,1),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT
]
)
},
# 2100
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4),
THERMAL_DEV_CATEGORY_MODULE:(1, 16),
THERMAL_DEV_CATEGORY_PSU:(0, 0),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,0),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT,
]
)
},
# 2410
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 2),
THERMAL_DEV_CATEGORY_MODULE:(1, 56),
THERMAL_DEV_CATEGORY_PSU:(1, 2),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,1),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT,
]
)
},
# 2740
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4),
THERMAL_DEV_CATEGORY_MODULE:(1, 32),
THERMAL_DEV_CATEGORY_PSU:(1, 2),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,0),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT,
]
)
},
# 2010
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4),
THERMAL_DEV_CATEGORY_MODULE:(1, 22),
THERMAL_DEV_CATEGORY_PSU:(0, 0),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,0),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT,
]
)
},
# 3700
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4),
THERMAL_DEV_CATEGORY_MODULE:(1, 32),
THERMAL_DEV_CATEGORY_PSU:(1, 2),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,1),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_COMEX_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT
]
)
},
# 3700c
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 2),
THERMAL_DEV_CATEGORY_MODULE:(1, 32),
THERMAL_DEV_CATEGORY_PSU:(1, 2),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,1),
THERMAL_DEV_CATEGORY_GEARBOX:(0,0),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_COMEX_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT
]
)
},
# 3800
{
THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4),
THERMAL_DEV_CATEGORY_MODULE:(1, 64),
THERMAL_DEV_CATEGORY_PSU:(1, 2),
THERMAL_DEV_CATEGORY_CPU_PACK:(0,1),
THERMAL_DEV_CATEGORY_GEARBOX:(1,32),
THERMAL_DEV_CATEGORY_AMBIENT:(0,
[
THERMAL_DEV_ASIC_AMBIENT,
THERMAL_DEV_COMEX_AMBIENT,
THERMAL_DEV_PORT_AMBIENT,
THERMAL_DEV_FAN_AMBIENT
]
)
},
]
def initialize_thermals(sku, thermal_list, psu_list):
# create thermal objects for all categories of sensors
tp_index = hwsku_dict_thermal[sku]
thermal_profile = thermal_profile_list[tp_index]
for category in thermal_device_categories_all:
if category == THERMAL_DEV_CATEGORY_AMBIENT:
count, ambient_list = thermal_profile[category]
for ambient in ambient_list:
thermal = Thermal(category, ambient, True)
thermal_list.append(thermal)
else:
start, count = 0, 0
if category in thermal_profile:
start, count = thermal_profile[category]
if count == 0:
continue
if count == 1:
thermal = Thermal(category, 0, False)
thermal_list.append(thermal)
else:
if category == THERMAL_DEV_CATEGORY_PSU:
for index in range(count):
thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off")
thermal_list.append(thermal)
else:
for index in range(count):
thermal = Thermal(category, start + index, True)
thermal_list.append(thermal)
class Thermal(ThermalBase):
def __init__(self, category, index, has_index, dependency = None, hint = None):
"""
index should be a string for category ambient and int for other categories
"""
if category == THERMAL_DEV_CATEGORY_AMBIENT:
self.name = thermal_ambient_name[index]
self.index = index
elif has_index:
self.name = thermal_name[category].format(index)
self.index = index
else:
self.name = thermal_name[category]
self.index = 0
self.category = category
self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE)
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
self.dependency = dependency
self.dependent_hint = hint
def get_name(self):
"""
Retrieves the name of the device
Returns:
string: The name of the device
"""
return self.name
def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
"""
result = None
try:
with open(filename, 'r') as fileobj:
result = fileobj.read()
except Exception as e:
logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
return result
def _get_file_from_api(self, api_name):
if self.category == THERMAL_DEV_CATEGORY_AMBIENT:
if api_name == THERMAL_API_GET_TEMPERATURE:
filename = thermal_ambient_apis[self.index]
else:
return None
else:
handler = thermal_api_handlers[self.category][api_name]
if self.category in thermal_device_categories_singleton:
filename = handler
else:
filename = handler.format(self.index)
return join(HW_MGMT_THERMAL_ROOT, filename)
def get_temperature(self):
"""
Retrieves current temperature reading from thermal
Returns:
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
if self.dependency and not self.dependency():
if self.dependent_hint:
hint = self.dependent_hint
else:
hint = "unknown reason"
logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.temperature, 0)
if value_str is None:
return None
value_float = float(value_str)
return value_float / 1000.0
def get_high_threshold(self):
"""
Retrieves the high threshold temperature of thermal
Returns:
A float number, the high threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
if self.high_threshold is None:
return None
value_str = self._read_generic_file(self.high_threshold, 0)
if value_str is None:
return None
value_float = float(value_str)
return value_float / 1000.0