[Mellanox] Fix issue: psu might use wrong voltage sysfs which causes invalid voltage value (#10231)

- Why I did it
Fix an issue where a PSU might use the wrong voltage sysfs file, which results in an invalid voltage value. The flow that triggers it is:

1. The user powers off a PSU
2. All sysfs files related to this PSU are removed
3. The user performs a reboot or a config reload
4. The PSU object picks the wrong sysfs file as its voltage node

- How I did it
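Always try to find an existing sysfs file: the voltage sysfs path is now resolved lazily when it is needed, instead of being fixed once in the constructor.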

- How to verify it
Manual test
This commit is contained in:
Junchao-Mellanox 2022-03-20 16:34:04 +08:00 committed by Judy Joseph
parent ef9df9edb5
commit 2773d29220
2 changed files with 52 additions and 11 deletions

View File

@ -200,16 +200,10 @@ class Psu(FixedPsu):
def __init__(self, psu_index):
super(Psu, self).__init__(psu_index)
psu_voltage_out2 = os.path.join(PSU_PATH, "power/psu{}_volt_out2".format(self.index))
psu_voltage = os.path.join(PSU_PATH, "power/psu{}_volt".format(self.index))
# Workaround for psu voltage sysfs file as the file name differs among platforms
if os.path.exists(psu_voltage_out2):
self.psu_voltage = psu_voltage_out2
else:
self.psu_voltage = psu_voltage
self.psu_voltage_min = self.psu_voltage + "_min"
self.psu_voltage_max = self.psu_voltage + "_max"
self.psu_voltage_capability = self.psu_voltage + "_capability"
self._psu_voltage = None
self._psu_voltage_min = None
self._psu_voltage_max = None
self._psu_voltage_capability = None
self.psu_current = os.path.join(PSU_PATH, self.PSU_CURRENT.format(self.index))
self.psu_power = os.path.join(PSU_PATH, self.PSU_POWER.format(self.index))
@ -228,6 +222,47 @@ class Psu(FixedPsu):
from .thermal import initialize_psu_thermal
self._thermal_list = initialize_psu_thermal(psu_index, self.get_power_available_status)
@property
def psu_voltage(self):
    """
    Path of the sysfs file holding this PSU's output voltage

    The file name differs among platforms, so the candidates are probed
    in order: "psu{index}_volt_out2" first, then "psu{index}_volt". The
    first candidate that exists is cached in self._psu_voltage. While no
    candidate exists nothing is cached, so the probe is repeated on the
    next access.

    Returns:
        The cached or newly resolved path, or the unchanged (falsy)
        self._psu_voltage when neither candidate exists
    """
    # A path that was already resolved is reused without touching sysfs.
    if self._psu_voltage:
        return self._psu_voltage

    for node_template in ("power/psu{}_volt_out2", "power/psu{}_volt"):
        candidate = os.path.join(PSU_PATH, node_template.format(self.index))
        if os.path.exists(candidate):
            self._psu_voltage = candidate
            break
    return self._psu_voltage
@property
def psu_voltage_min(self):
    """
    Path of the sysfs file holding the minimum voltage of this PSU

    The path is the value of self.psu_voltage with "_min" appended. It is
    cached in self._psu_voltage_min once self.psu_voltage yields a truthy
    value.

    Returns:
        The cached or newly derived path, or the unchanged (falsy)
        self._psu_voltage_min when self.psu_voltage is falsy
    """
    if self._psu_voltage_min:
        return self._psu_voltage_min

    base_node = self.psu_voltage
    if base_node:
        self._psu_voltage_min = base_node + "_min"
    return self._psu_voltage_min
@property
def psu_voltage_max(self):
    """
    Path of the sysfs file holding the maximum voltage of this PSU

    The path is the value of self.psu_voltage with "_max" appended. It is
    cached in self._psu_voltage_max once self.psu_voltage yields a truthy
    value.

    Returns:
        The cached or newly derived path, or the unchanged (falsy)
        self._psu_voltage_max when self.psu_voltage is falsy
    """
    if self._psu_voltage_max:
        return self._psu_voltage_max

    base_node = self.psu_voltage
    if base_node:
        self._psu_voltage_max = base_node + "_max"
    return self._psu_voltage_max
@property
def psu_voltage_capability(self):
    """
    Path of the sysfs file holding the voltage capability of this PSU

    The path is the value of self.psu_voltage with "_capability" appended.
    It is cached in self._psu_voltage_capability once self.psu_voltage
    yields a truthy value.

    Returns:
        The cached or newly derived path, or the unchanged (falsy)
        self._psu_voltage_capability when self.psu_voltage is falsy
    """
    if self._psu_voltage_capability:
        return self._psu_voltage_capability

    base_node = self.psu_voltage
    if base_node:
        self._psu_voltage_capability = base_node + "_capability"
    return self._psu_voltage_capability
def get_model(self):
"""
Retrieves the model number (or part number) of the device
@ -272,7 +307,7 @@ class Psu(FixedPsu):
A float number, the output voltage in volts,
e.g. 12.1
"""
if self.get_powergood_status():
if self.get_powergood_status() and self.psu_voltage:
# TODO: should we put log_func=None here? If not do this, when a PSU is back to power, some PSU related
# sysfs may not ready, read_int_from_file would encounter exception and log an error.
voltage = utils.read_int_from_file(self.psu_voltage, log_func=logger.log_info)

View File

@ -50,6 +50,7 @@ class TestPsu:
assert psu.get_temperature() is None
assert psu.get_temperature_high_threshold() is None
@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
def test_psu(self):
psu = Psu(0)
assert len(psu._fan_list) == 1
@ -58,6 +59,8 @@ class TestPsu:
psu.psu_presence: 1,
psu.psu_oper_status: 1,
psu.psu_voltage: 10234,
psu.psu_voltage_min: 9000,
psu.psu_voltage_max: 12000,
psu.psu_current: 20345,
psu.psu_power: 30456,
psu.psu_temp: 40567,
@ -68,6 +71,7 @@ class TestPsu:
return mock_sysfs_content[file_path]
utils.read_int_from_file = mock_read_int_from_file
utils.read_str_from_file = mock.MagicMock(return_value='min max')
assert psu.get_presence() is True
mock_sysfs_content[psu.psu_presence] = 0
assert psu.get_presence() is False
@ -84,6 +88,8 @@ class TestPsu:
mock_sysfs_content[psu.psu_oper_status] = 1
assert psu.get_voltage() == 10.234
assert psu.get_voltage_high_threshold() == 12.0
assert psu.get_voltage_low_threshold() == 9.0
assert psu.get_current() == 20.345
assert psu.get_power() == 0.030456
assert psu.get_temperature() == 40.567