[Mellanox] Auto correct PSU voltage threshold (WA) (#10394)

- Why I did it
There is a hardware bug where the PSU voltage threshold sysfs file returns an incorrect value. The workaround is to call "sensors -s" to refresh it.

- How I did it
Call "sensors -s" when the threshold value is incorrect and the PSU is a "DELTA 1100"

- How to verify it
Unit test and Manual test
This commit is contained in:
Junchao-Mellanox 2022-04-14 13:14:40 +08:00 committed by Judy Joseph
parent bf34b17d20
commit bdbb3d708d
10 changed files with 169 additions and 0 deletions

View File

@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
# Chassis fans
chip "mlxreg_fan-isa-*"

View File

@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
# Chassis fans
chip "mlxreg_fan-isa-*"

View File

@ -106,6 +106,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
@ -120,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
# Chassis fans
chip "mlxreg_fan-isa-*"

View File

@ -167,6 +167,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1(L) 12V Rail Pwr (out)"
label curr1 "PSU-1(L) 220V Rail Curr (in)"
label curr2 "PSU-1(L) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-2(R) 220V Rail (in)"
ignore in2
@ -181,6 +184,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2(R) 12V Rail Pwr (out)"
label curr1 "PSU-2(R) 220V Rail Curr (in)"
label curr2 "PSU-2(R) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
# Chassis fans
chip "mlxreg_fan-isa-*"

View File

@ -123,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1(L) 12V Rail Pwr (out)"
label curr1 "PSU-1(L) 220V Rail Curr (in)"
label curr2 "PSU-1(L) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-2(R) 220V Rail (in)"
ignore in2
@ -137,6 +140,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2(R) 12V Rail Pwr (out)"
label curr1 "PSU-2(R) 220V Rail Curr (in)"
label curr2 "PSU-2(R) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
# Chassis fans
chip "mlxreg_fan-isa-*"

View File

@ -24,8 +24,10 @@
try:
import os
import time
from sonic_platform_base.psu_base import PsuBase
from sonic_py_common.logger import Logger
from .device_data import DeviceDataManager
from .led import PsuLed, SharedLed, ComponentFaultyIndicator
from . import utils
from .vpd_parser import VpdParser
@ -411,6 +413,7 @@ class Psu(FixedPsu):
capability = utils.read_str_from_file(self.psu_voltage_capability)
if 'max' in capability:
max_voltage = utils.read_int_from_file(self.psu_voltage_max, log_func=logger.log_info)
max_voltage = InvalidPsuVolWA.run(self, max_voltage, self.psu_voltage_max)
return float(max_voltage) / 1000
return None
@ -431,6 +434,7 @@ class Psu(FixedPsu):
capability = utils.read_str_from_file(self.psu_voltage_capability)
if 'min' in capability:
min_voltage = utils.read_int_from_file(self.psu_voltage_min, log_func=logger.log_info)
min_voltage = InvalidPsuVolWA.run(self, min_voltage, self.psu_voltage_min)
return float(min_voltage) / 1000
return None
@ -448,3 +452,69 @@ class Psu(FixedPsu):
return float(power_max) / 1000000
else:
return None
class InvalidPsuVolWA:
    """Workaround for a known hardware issue: the PSU voltage threshold sysfs
    file may report the invalid value 127998.

    Once a voltage threshold equal to 127998 is read, the workaround does the
    following:
        1. Check that the platform is one of EXPECT_PLATFORMS.
        2. Check the PSU VPD: the vendor should be DELTA and the capacity 1100.
           A field that reads 'N/A' does not block the workaround.
        3. Call "sensors -s", which evaluates the "set" statements of the
           sensors configuration.
        4. Poll the threshold file until it no longer holds the invalid value.

    This issue is found on 3700, 3700c, 3800, 4600c
    """

    # Raw value reported by the threshold sysfs file when the issue occurs
    INVALID_VOLTAGE_VALUE = 127998
    EXPECT_VENDOR_NAME = 'DELTA'
    EXPECT_CAPACITY = '1100'
    EXPECT_PLATFORMS = ['x86_64-mlnx_msn3700-r0', 'x86_64-mlnx_msn3700c-r0', 'x86_64-mlnx_msn3800-r0', 'x86_64-mlnx_msn4600c-r0']
    # VPD keys used to identify the PSU
    MFR_FIELD = 'MFR_NAME'
    CAPACITY_FIELD = 'CAPACITY'
    # Maximum number of one-second polls after the workaround command is issued
    WAIT_TIME = 5

    @classmethod
    def run(cls, psu, threshold_value, threshold_file):
        """Apply the workaround if the threshold value is the known invalid one.

        Args:
            psu: PSU object; its ``index`` and ``vpd_parser`` attributes are used.
            threshold_value: Value that was read from ``threshold_file``.
            threshold_file: Path of the threshold sysfs file to re-read.

        Returns:
            ``threshold_value`` unchanged when the workaround does not apply,
            the refreshed value when the workaround succeeds, or None when the
            value is still invalid after WAIT_TIME seconds.
            NOTE(review): callers that feed the result to float() must guard
            against None.
        """
        if threshold_value != cls.INVALID_VOLTAGE_VALUE:
            # If the threshold value is not an invalid value, just return
            return threshold_value

        platform_name = DeviceDataManager.get_platform_name()
        # Apply the WA to specified platforms
        if platform_name not in cls.EXPECT_PLATFORMS:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but platform is {}'.format(psu.index, threshold_file, threshold_value, platform_name))
            return threshold_value

        # Check PSU vendor, make sure it is DELTA
        vendor_name = psu.vpd_parser.get_entry_value(cls.MFR_FIELD)
        if vendor_name != 'N/A' and vendor_name != cls.EXPECT_VENDOR_NAME:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but its vendor is {}'.format(psu.index, threshold_file, threshold_value, vendor_name))
            return threshold_value

        # Check PSU capacity, make sure it is 1100
        capacity = psu.vpd_parser.get_entry_value(cls.CAPACITY_FIELD)
        if capacity != 'N/A' and capacity != cls.EXPECT_CAPACITY:
            logger.log_warning('PSU {} threshold file {} value {}, but its capacity is {}'.format(psu.index, threshold_file, threshold_value, capacity))
            return threshold_value

        # Run "sensors -s" to trigger the hardware to report the real threshold
        # value. The lm-sensors executable is named "sensors"; "sensor" does
        # not exist, so the command would fail and the workaround never run.
        utils.run_command('sensors -s')

        # Wait for the threshold value change
        return cls.wait_set_done(threshold_file)

    @classmethod
    def wait_set_done(cls, threshold_file):
        """Poll ``threshold_file`` once per second until its value is valid.

        Args:
            threshold_file: Path of the threshold sysfs file to read.

        Returns:
            The first value read that differs from INVALID_VOLTAGE_VALUE, or
            None if the value is still invalid after WAIT_TIME polls.
        """
        for _ in range(cls.WAIT_TIME):
            value = utils.read_int_from_file(threshold_file, log_func=logger.log_info)
            if value != cls.INVALID_VOLTAGE_VALUE:
                return value
            time.sleep(1)

        logger.log_error('sensors -s does not recover PSU threshold sensor after {} seconds'.format(cls.WAIT_TIME))
        return None

View File

@ -194,3 +194,16 @@ def default_return(return_value, log_func=logger.log_debug):
return return_value
return _impl
return wrapper
def run_command(command):
    """
    Utility function to run a shell command and return its output.

    The command is executed through the shell. Failures are logged instead of
    being silently dropped, so that a mistyped or failing command can be
    diagnosed; the return value contract is unchanged.

    :param command: Shell command string.
    :return: Standard output of the command with surrounding whitespace
             stripped, or None if the command could not be executed.
    """
    try:
        process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
    except Exception as e:
        # Best effort: callers expect None rather than an exception
        logger.log_error('Failed to run command {} - {}'.format(command, repr(e)))
        return None

    if process.returncode != 0:
        # The output is still returned, but make the failure visible
        logger.log_warning('Command {} exited with code {}: {}'.format(command, process.returncode, stderr.strip()))

    return stdout.strip()

View File

@ -24,6 +24,7 @@ logger = Logger()
SN_VPD_FIELD = "SN_VPD_FIELD"
PN_VPD_FIELD = "PN_VPD_FIELD"
REV_VPD_FIELD = "REV_VPD_FIELD"
MFR_VPD_FIELD = "MFR_NAME"
class VpdParser:
@ -82,3 +83,17 @@ class VpdParser:
logger.log_error("Fail to read revision: No key {} in VPD {}".format(REV_VPD_FIELD, self.vpd_file))
return 'N/A'
return self.vpd_data.get(REV_VPD_FIELD, 'N/A')
def get_entry_value(self, key):
    """
    Look up a single entry of the device VPD by key

    Args:
        key: Name of the VPD field to look up

    Returns:
        string: Value stored for the key, or 'N/A' when the key is absent
    """
    # Either the VPD data could not be loaded, or the key is present:
    # in both cases a plain lookup with the 'N/A' default is all that is needed.
    if not self._get_data() or key in self.vpd_data:
        return self.vpd_data.get(key, 'N/A')

    # VPD data was loaded but does not contain the requested key
    logger.log_warning("Fail to read vpd info: No key {} in VPD {}".format(key, self.vpd_file))
    return 'N/A'

View File

@ -116,3 +116,40 @@ class TestPsu:
assert psu.get_model() == 'MTEF-PSF-AC-C'
assert psu.get_serial() == 'MT1946X07684'
assert psu.get_revision() == 'A3'
assert psu.vpd_parser.get_entry_value('MFR_NAME') == 'DELTA'
@mock.patch('sonic_platform.utils.read_int_from_file', mock.MagicMock(return_value=9999))
@mock.patch('sonic_platform.utils.run_command')
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name')
@mock.patch('sonic_platform.vpd_parser.VpdParser.get_entry_value')
def test_psu_workaround(self, mock_get_entry_value, mock_get_platform_name, mock_run_command):
    """Walk through every exit path of InvalidPsuVolWA.run.

    read_int_from_file is patched to return 9999, so once the workaround
    command has been issued the re-read threshold is 9999.
    """
    from sonic_platform.psu import InvalidPsuVolWA
    psu = Psu(0)

    # Threshold value is not InvalidPsuVolWA.INVALID_VOLTAGE_VALUE
    assert InvalidPsuVolWA.run(psu, 9999, '') == 9999

    # Platform name is not in InvalidPsuVolWA.EXPECT_PLATFORMS
    mock_get_platform_name.return_value = 'some platform'
    assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == InvalidPsuVolWA.INVALID_VOLTAGE_VALUE

    # PSU vendor is not InvalidPsuVolWA.EXPECT_VENDOR_NAME
    vpd_info = {
        InvalidPsuVolWA.MFR_FIELD: 'some psu',
        InvalidPsuVolWA.CAPACITY_FIELD: 'some capacity'
    }

    def get_entry_value(key):
        return vpd_info[key]

    mock_get_entry_value.side_effect = get_entry_value
    mock_get_platform_name.return_value = 'x86_64-mlnx_msn3700-r0'
    assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == InvalidPsuVolWA.INVALID_VOLTAGE_VALUE

    # PSU capacity is not InvalidPsuVolWA.EXPECT_CAPACITY
    vpd_info[InvalidPsuVolWA.MFR_FIELD] = InvalidPsuVolWA.EXPECT_VENDOR_NAME
    assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == InvalidPsuVolWA.INVALID_VOLTAGE_VALUE

    # Normal
    vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY
    assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == 9999
    # The lm-sensors executable is "sensors"; expecting "sensor -s" would pin
    # a command that does not exist.
    mock_run_command.assert_called_with('sensors -s')

View File

@ -116,3 +116,7 @@ class TestUtils:
assert func() == 100
assert mock_log.call_count == 1
def test_run_command(self):
    # Use a command whose output is fixed, so the test does not depend on the
    # contents of the current working directory (as 'ls' does).
    output = utils.run_command('echo test')
    assert output == 'test'