[Mellanox]New platform api -- chassis part (#3082)

* new platform api, chassis part

* Inject mlnx mlx libs to platform monitor

* address the review comments

* remove some confusing naming.

* Make the minor cause more human-readable when the reboot was triggered by firmware

* address review comments

* expose host dir /host/reboot-cause to pmon docker so that the reboot causing by user command can be identified

* 1. Revert "expose host dir /host/reboot-cause to pmon docker so that the reboot causing by user command can be identified"
Since the only hardware-causing reboot should be handled by get_reboot_cause and the logic of handling reboot cause is about to move to the host side, no need to mount this dir to pmon docker.
This reverts commit 3feb96869d.
2. adjust log output by using sonic_daemon_base.daemon_base.Logger.
3. remove the logic of verifying /host/reboot-cause/ files.
4. fix typo.

* Implement get_firmware_version and adjust the component version retrieval interfaces according to Azure/sonic-platform-common#34
This commit is contained in:
Stephen Sun 2019-07-04 19:29:58 +08:00 committed by liat-grozovik
parent 9a8202a39d
commit 82fb3a099d
3 changed files with 378 additions and 5 deletions

View File

@ -17,10 +17,14 @@ try:
from sonic_platform.fan import FAN_PATH
from sonic_platform.sfp import SFP
from sonic_platform.watchdog import get_watchdog
from sonic_daemon_base.daemon_base import Logger
from eeprom import Eeprom
from os import listdir
from os.path import isfile, join
import io
import re
import subprocess
import syslog
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
@ -28,6 +32,41 @@ MLNX_NUM_PSU = 2
# Command used to query the hwsku (DEVICE_METADATA.localhost.hwsku) of this device
GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku"
EEPROM_CACHE_ROOT = '/var/cache/sonic/decode-syseeprom'
EEPROM_CACHE_FILE = 'syseeprom_cache'
# Directory that holds both the reboot cause files and the CPLD version files
HWMGMT_SYSTEM_ROOT = '/var/run/hw-management/system/'
# Reboot cause related definitions.
# Each *_FILE name is joined with REBOOT_CAUSE_ROOT and read by
# Chassis._verify_reboot_cause, which treats a non-zero integer as "asserted".
REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT
REBOOT_CAUSE_POWER_LOSS_FILE = 'reset_main_pwr_fail'
REBOOT_CAUSE_THERMAL_OVERLOAD_ASIC_FILE = 'reset_asic_thermal'
REBOOT_CAUSE_WATCHDOG_FILE = 'reset_hotswap_or_wd'
REBOOT_CAUSE_MLNX_FIRMWARE_RESET = 'reset_fw_reset'
# Number of characters read from a reboot cause file
REBOOT_CAUSE_FILE_LENGTH = 1
# Version retrieving related definitions
CPLD_VERSION_ROOT = HWMGMT_SYSTEM_ROOT
CPLD1_VERSION_FILE = 'cpld1_version'
CPLD2_VERSION_FILE = 'cpld2_version'
# Maximum number of characters read from a CPLD version file
CPLD_VERSION_MAX_LENGTH = 4
# Shell commands whose output is parsed for the ASIC firmware / BIOS version
FW_QUERY_VERSION_COMMAND = 'mlxfwmanager --query'
BIOS_QUERY_VERSION_COMMAND = 'dmidecode -t 11'
# Component names accepted by Chassis.get_firmware_version
COMPONENT_BIOS = "BIOS"
COMPONENT_FIRMWARE = "ASIC-FIRMWARE"
COMPONENT_CPLD1 = "CPLD1"
COMPONENT_CPLD2 = "CPLD2"
# Global logger class instance
SYSLOG_IDENTIFIER = "mlnx-chassis"
logger = Logger(SYSLOG_IDENTIFIER)
# Magic code definition for port number, qsfp port position of each hwsku.
# The value of each hwsku is used as an index into port_position_tuple_list.
# port_position_tuple = (PORT_START, QSFP_PORT_START, PORT_END, PORT_IN_BLOCK, EEPROM_OFFSET)
hwsku_dict = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 0, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 3, 'ACS-MSN3700': 0, 'ACS-MSN3700C': 0, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0}
@ -37,7 +76,7 @@ class Chassis(ChassisBase):
"""Platform-specific Chassis class"""
def __init__(self):
ChassisBase.__init__(self)
super(Chassis, self).__init__()
# Initialize PSU list
for index in range(MLNX_NUM_PSU):
@ -46,7 +85,7 @@ class Chassis(ChassisBase):
# Initialize watchdog
self._watchdog = get_watchdog()
# Initialize FAN list
multi_rotor_in_drawer = False
num_of_fan, num_of_drawer = self._extract_num_of_fans_and_fan_drawers()
@ -65,14 +104,23 @@ class Chassis(ChassisBase):
self.QSFP_PORT_START = port_position_tuple[1]
self.PORT_END = port_position_tuple[2]
self.PORTS_IN_BLOCK = port_position_tuple[3]
for index in range(self.PORT_START, self.PORT_END + 1):
if index in range(QSFP_PORT_START, self.PORTS_IN_BLOCK + 1):
if index in range(self.QSFP_PORT_START, self.PORTS_IN_BLOCK + 1):
sfp_module = SFP(index, 'QSFP')
else:
sfp_module = SFP(index, 'SFP')
self._sfp_list.append(sfp_module)
# Initialize EEPROM
self.eeprom = Eeprom()
# Initialize component list
self._component_name_list.append(COMPONENT_BIOS)
self._component_name_list.append(COMPONENT_FIRMWARE)
self._component_name_list.append(COMPONENT_CPLD1)
self._component_name_list.append(COMPONENT_CPLD2)
def _extract_num_of_fans_and_fan_drawers(self):
num_of_fan = 0
num_of_drawer = 0
@ -95,5 +143,178 @@ class Chassis(ChassisBase):
position_tuple = port_position_tuple_list[hwsku_dict[out.rstrip('\n')]]
return position_tuple
def get_base_mac(self):
    """
    Retrieves the base MAC address for the chassis

    The value is obtained from the chassis' Eeprom object.

    Returns:
        A string containing the MAC address in the format
        'XX:XX:XX:XX:XX:XX'
    """
    base_mac = self.eeprom.get_base_mac()
    return base_mac
def get_serial_number(self):
    """
    Retrieves the hardware serial number for the chassis

    The value is obtained from the chassis' Eeprom object.

    Returns:
        A string containing the hardware serial number for this chassis.
    """
    serial_number = self.eeprom.get_serial_number()
    return serial_number
def get_system_eeprom_info(self):
    """
    Retrieves the full content of system EEPROM information for the chassis

    The value is obtained from the chassis' Eeprom object.

    Returns:
        A dictionary where keys are the type code defined in
        OCP ONIE TlvInfo EEPROM format and values are their corresponding
        values.
    """
    eeprom_info = self.eeprom.get_system_eeprom_info()
    return eeprom_info
def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
"""
result = ''
try:
fileobj = io.open(filename)
result = fileobj.read(len)
fileobj.close()
return result
except:
logger.log_warning("Fail to read file {}, maybe it doesn't exist".format(filename))
return ''
def _verify_reboot_cause(self, filename):
    '''
    Check whether a reboot cause file in REBOOT_CAUSE_ROOT
    (/var/run/hw-management/system/) reports its cause as asserted.

    Args:
        filename: name of the reboot cause file, relative to REBOOT_CAUSE_ROOT.

    Returns:
        True if the file holds a non-zero integer, False otherwise.
        A file that is missing, unreadable or holds non-numeric data is
        treated as '0', i.e. False.
    '''
    content = self._read_generic_file(join(REBOOT_CAUSE_ROOT, filename), REBOOT_CAUSE_FILE_LENGTH)
    try:
        return bool(int(content.rstrip('\n')))
    except ValueError:
        # _read_generic_file returns '' when the file cannot be read and
        # int('') raises ValueError; report "cause not asserted" instead
        # of propagating the error to get_reboot_cause.
        return False
def get_reboot_cause(self):
    """
    Retrieves the cause of the previous reboot

    The reboot cause files are probed in a fixed order and the first one
    that is asserted decides the result: power loss, ASIC thermal
    overload, watchdog. If none of them is asserted, the firmware reset
    file is checked; when that one is not asserted either, the reboot is
    reported as non-hardware.

    Returns:
        A tuple (string, string) where the first element is a string
        containing the cause of the previous reboot. This string must be
        one of the predefined strings in this class. If the first string
        is "REBOOT_CAUSE_HARDWARE_OTHER", the second string can be used
        to pass a description of the reboot cause.
    """
    # (cause file, major cause) pairs in probing order
    prioritized_causes = (
        (REBOOT_CAUSE_POWER_LOSS_FILE, self.REBOOT_CAUSE_POWER_LOSS),
        (REBOOT_CAUSE_THERMAL_OVERLOAD_ASIC_FILE, self.REBOOT_CAUSE_THERMAL_OVERLOAD_ASIC),
        (REBOOT_CAUSE_WATCHDOG_FILE, self.REBOOT_CAUSE_WATCHDOG),
    )
    for cause_file, cause in prioritized_causes:
        if self._verify_reboot_cause(cause_file):
            return cause, ''

    # Only a firmware reset carries a description (minor cause)
    if self._verify_reboot_cause(REBOOT_CAUSE_MLNX_FIRMWARE_RESET):
        return self.REBOOT_CAUSE_HARDWARE_OTHER, "Reset by ASIC firmware"

    return self.REBOOT_CAUSE_NON_HARDWARE, ''
def _get_cpld_version(self, version_file):
    """
    Read a CPLD version file located in CPLD_VERSION_ROOT

    Args:
        version_file: name of the version file, relative to CPLD_VERSION_ROOT.

    Returns:
        A string with at most CPLD_VERSION_MAX_LENGTH characters of the
        file, with trailing newlines removed.
    """
    version_path = join(CPLD_VERSION_ROOT, version_file)
    raw_version = self._read_generic_file(version_path, CPLD_VERSION_MAX_LENGTH)
    return raw_version.rstrip('\n')
def _get_command_result(self, cmdline):
try:
proc = subprocess.Popen(cmdline, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
stdout = proc.communicate()[0]
proc.wait()
result = stdout.rstrip('\n')
except OSError, e:
result = ''
return result
def _get_firmware_version(self):
    r"""
    firmware version is retrieved via command 'mlxfwmanager --query'
    which should return result in the following convention
        admin@mtbc-sonic-01-2410:~$ sudo mlxfwmanager --query
        Querying Mellanox devices firmware ...

        Device #1:
        ----------

        Device Type: Spectrum
        Part Number: MSN2410-CxxxO_Ax_Bx
        Description: Spectrum based 25GbE/100GbE 1U Open Ethernet switch with ONIE; 48 SFP28 ports; 8 QSFP28 ports; x86 dual core; RoHS6
        PSID: MT_2860111033
        PCI Device Name: /dev/mst/mt52100_pci_cr0
        Base MAC: 98039bf3f500
        Versions: Current Available
        FW ***13.2000.1140***N/A

        Status: No matching image found

    By using regular expression '(Versions:.*\n[\s]+FW[\s]+)([\S]+)',
    we can extract the version which is marked with *** in the above context

    Returns:
        A string containing the firmware version, or '' when the command
        output does not match the expected layout.
    """
    fw_ver_str = self._get_command_result(FW_QUERY_VERSION_COMMAND)
    # Raw string keeps the regex escapes (\s, \S, \n) out of Python's own
    # string-escape processing.
    m = re.search(r'(Versions:.*\n[\s]+FW[\s]+)([\S]+)', fw_ver_str)
    if m is None:
        return ''
    return m.group(2)
def _get_bios_version(self):
    r"""
    BIOS version is retrieved via command 'dmidecode -t 11'
    which should return result in the following convention
        # dmidecode 3.0
        Getting SMBIOS data from sysfs.
        SMBIOS 2.7 present.

        Handle 0x0022, DMI type 11, 5 bytes
        OEM Strings
                String 1:*0ABZS017_02.02.002*
                String 2: To Be Filled By O.E.M.

    By using regular expression 'OEM[\s]*Strings\n[\s]*String[\s]*1:[\s]*([0-9a-zA-Z_\.]*)'
    we can extract the version string which is marked with * in the above context

    Returns:
        A string containing the BIOS version, or '' when the command
        output does not match the expected layout.
    """
    bios_ver_str = self._get_command_result(BIOS_QUERY_VERSION_COMMAND)
    # Raw string keeps the regex escapes (\s, \n, \.) out of Python's own
    # string-escape processing.
    m = re.search(r'OEM[\s]*Strings\n[\s]*String[\s]*1:[\s]*([0-9a-zA-Z_\.]*)', bios_ver_str)
    if m is None:
        return ''
    return m.group(1)
def get_firmware_version(self, component_name):
    """
    Retrieves platform-specific hardware/firmware versions for chassis
    components such as BIOS, CPLD, FPGA, etc.

    Args:
        component_name: A string, the component name.

    Returns:
        A string containing platform-specific component versions, or
        None when component_name is not a known component.
    """
    if component_name not in self._component_name_list:
        return None

    # One version reader per supported component
    version_readers = {
        COMPONENT_BIOS: self._get_bios_version,
        COMPONENT_CPLD1: lambda: self._get_cpld_version(CPLD1_VERSION_FILE),
        COMPONENT_CPLD2: lambda: self._get_cpld_version(CPLD2_VERSION_FILE),
        COMPONENT_FIRMWARE: self._get_firmware_version,
    }
    reader = version_readers.get(component_name)
    if reader is None:
        return None
    return reader()

View File

@ -0,0 +1,149 @@
#!/usr/bin/env python
#############################################################################
# Mellanox
#
# Module contains an implementation of SONiC Platform Base API and
# provides the eeprom information which are available in the platform
#
#############################################################################
import exceptions
import os
import re
import sys
import syslog
import time
from cStringIO import StringIO
try:
from sonic_platform_base.sonic_eeprom import eeprom_tlvinfo
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
#
# CACHE_XXX definitions are supposed to be moved to the base classes
# since they are common for all vendors.
# They are defined in decode-syseeprom, which might be removed in the future;
# for now they are just copied here.
#
CACHE_ROOT = '/var/cache/sonic/decode-syseeprom'
CACHE_FILE = 'syseeprom_cache'
#
# This is mlnx-specific.
# Open question: should it live in chassis.py or here?
#
EEPROM_SYMLINK = "/var/run/hw-management/eeprom/vpd_info"
class Eeprom(eeprom_tlvinfo.TlvInfoDecoder):
    """
    System EEPROM reader built on top of TlvInfoDecoder

    The EEPROM is read through EEPROM_SYMLINK, with a cache file at
    CACHE_ROOT/CACHE_FILE. The base MAC, the serial number and a
    {type code: value} dictionary are extracted once and kept on the instance.
    """
    # How many times (one second apart) to wait for EEPROM_SYMLINK to appear
    RETRIES = 3
    # Number of leading lines of the decode_eeprom() output that are skipped
    EEPROM_DECODE_HEADLINES = 6
    EEPROM_DECODE_MAXITEM = 3
    EEPROM_DECODE_OFFSET = 0
    EEPROM_DECODE_CONTENT = 2

    # Matches a decoded TLV line: group 1 is the type code (0xNN), group 3 is
    # the value. NOTE: group 3 is [\S]+, so only the first whitespace-delimited
    # token of the value is captured.
    _TLV_LINE_RE = re.compile(r'(0x[0-9a-fA-F]{2})([\s]+[\S]+[\s]+)([\S]+)')

    def __init__(self):
        """
        Wait for the EEPROM symlink, then load the EEPROM contents

        Raises:
            RuntimeError: neither the EEPROM symlink nor the cache file exists.
        """
        for _ in range(self.RETRIES):
            if os.path.islink(EEPROM_SYMLINK):
                break
            time.sleep(1)

        if not (os.path.exists(EEPROM_SYMLINK)
                or os.path.isfile(os.path.join(CACHE_ROOT, CACHE_FILE))):
            syslog.syslog(syslog.LOG_ERR,
                          "Nowhere to read syseeprom from! No symlink or cache file found")
            raise RuntimeError("No syseeprom symlink or cache file found")

        self.eeprom_path = EEPROM_SYMLINK
        super(Eeprom, self).__init__(self.eeprom_path, 0, '', True)

        # Placeholders so that the getters return something sensible (rather
        # than raising AttributeError) if the EEPROM cannot be read.
        self._base_mac = "Undefined."
        self._serial_str = "Undefined."
        self._eeprom_info_dict = dict()
        # Set to True by _load_eeprom() once a read has succeeded; while it
        # is False the getters retry the load.
        self._eeprom_loaded = False
        self._load_eeprom()

    def _load_eeprom(self):
        """
        Read the EEPROM and refresh the cached MAC, serial number and
        type-code dictionary

        Marks the instance as loaded on success. Returns 0 in all cases.
        """
        if not os.path.exists(CACHE_ROOT):
            try:
                os.makedirs(CACHE_ROOT)
            except OSError:
                # Best effort: the cache is optional
                pass

        try:
            self.set_cache_name(os.path.join(CACHE_ROOT, CACHE_FILE))
        except Exception:
            # Best effort: the cache is optional
            pass

        eeprom = self.read_eeprom()
        if eeprom is None:
            return 0

        try:
            self.update_cache(eeprom)
        except Exception:
            # Best effort: the cache is optional
            pass

        self._base_mac = self.mgmtaddrstr(eeprom)
        if self._base_mac is None:
            self._base_mac = "Undefined."

        self._serial_str = self.serial_number_str(eeprom)
        if self._serial_str is None:
            self._serial_str = "Undefined."

        # decode_eeprom() prints its result, so capture stdout while it runs
        # and always restore the real stdout afterwards.
        original_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            self.decode_eeprom(eeprom)
            decode_output = sys.stdout.getvalue()
        finally:
            sys.stdout = original_stdout

        # Parse decode_output into a dictionary.
        # str.replace returns a new string, so the result must be assigned.
        decode_output = decode_output.replace('\0', '')
        lines = decode_output.split('\n')[self.EEPROM_DECODE_HEADLINES:]
        self._eeprom_info_dict = dict()

        for line in lines:
            match = self._TLV_LINE_RE.search(line)
            if match is not None:
                idx = match.group(1)
                value = match.group(3).rstrip('\0')
                self._eeprom_info_dict[idx] = value

        self._eeprom_loaded = True
        return 0

    def get_base_mac(self):
        """
        Retrieves the base MAC address for the chassis

        Returns:
            A string containing the MAC address in the format
            'XX:XX:XX:XX:XX:XX'
        """
        if not self._eeprom_loaded:
            self._load_eeprom()
        return self._base_mac

    def get_serial_number(self):
        """
        Retrieves the hardware serial number for the chassis

        Returns:
            A string containing the hardware serial number for this chassis.
        """
        if not self._eeprom_loaded:
            self._load_eeprom()
        return self._serial_str

    def get_system_eeprom_info(self):
        """
        Retrieves the full content of system EEPROM information for the chassis

        Returns:
            A dictionary where keys are the type code defined in
            OCP ONIE TlvInfo EEPROM format and values are their corresponding
            values.
        """
        if not self._eeprom_loaded:
            self._load_eeprom()
        return self._eeprom_info_dict

View File

@ -25,3 +25,6 @@ $(SYNCD)_RDEPENDS += $(MLNX_SAI)
# Inject mlnx sdk libs to platform monitor
$(DOCKER_PLATFORM_MONITOR)_DEPENDS += $(APPLIBS) $(SX_COMPLIB) $(SXD_LIBS) $(SX_GEN_UTILS) $(PYTHON_SDK_API) $(APPLIBS_DEV) $(SX_COMPLIB_DEV) $(SXD_LIBS_DEV) $(SX_GEN_UTILS_DEV)
# Inject mlnx mlx libs to platform monitor
# NOTE(review): presumably $(MFT) provides the mlxfwmanager tool that the
# chassis platform API runs to query the ASIC firmware version -- confirm
$(DOCKER_PLATFORM_MONITOR)_DEPENDS += $(MFT)