[Mellanox] wait reset cause ready (#16722)

Why I did it
The SONiC service determine-reboot-cause might run before the driver has created the reset cause files. In that case, the reset cause is reported as "Unknown". This PR introduces a mechanism that waits for the reset cause sysfs files to be ready.

How I did it
/run/hw-management/config/reset_attr_ready is the file that indicates all reset cause files are ready. The chassis.get_reboot_cause function waits up to 45 seconds for /run/hw-management/config/reset_attr_ready to report ready.

How to verify it
Manual test on master/202211/202205
This commit is contained in:
Junchao-Mellanox 2023-10-04 09:58:31 +08:00 committed by GitHub
parent ada2d88d02
commit aedffd333b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 2 deletions

View File

@ -59,8 +59,9 @@ HWMGMT_SYSTEM_ROOT = '/var/run/hw-management/system/'
#reboot cause related definitions #reboot cause related definitions
REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT
REBOOT_CAUSE_MAX_WAIT_TIME = 45
REBOOT_CAUSE_FILE_LENGTH = 1 REBOOT_CAUSE_CHECK_INTERVAL = 5
REBOOT_CAUSE_READY_FILE = '/run/hw-management/config/reset_attr_ready'
REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline" REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline"
REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*" REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*"
@ -782,6 +783,16 @@ class Chassis(ChassisBase):
return 'fast-reboot' return 'fast-reboot'
return None return None
def _wait_reboot_cause_ready(self):
    """Wait until the reset cause ready flag reads 1.

    Polls REBOOT_CAUSE_READY_FILE every REBOOT_CAUSE_CHECK_INTERVAL
    seconds, sleeping for at most REBOOT_CAUSE_MAX_WAIT_TIME seconds in
    total.

    Returns:
        True as soon as the ready file reads 1, False if it never does
        within the wait budget.
    """
    remaining = REBOOT_CAUSE_MAX_WAIT_TIME
    while True:
        # log_func=None presumably silences read errors while the file
        # does not exist yet -- confirm against utils.read_int_from_file.
        if utils.read_int_from_file(REBOOT_CAUSE_READY_FILE, log_func=None) == 1:
            return True
        # Check before sleeping so that every sleep is followed by a read;
        # the previous loop slept a final interval and then gave up without
        # looking at the flag again.
        if remaining <= 0:
            return False
        time.sleep(REBOOT_CAUSE_CHECK_INTERVAL)
        remaining -= REBOOT_CAUSE_CHECK_INTERVAL
def get_reboot_cause(self): def get_reboot_cause(self):
""" """
Retrieves the cause of the previous reboot Retrieves the cause of the previous reboot
@ -802,6 +813,10 @@ class Chassis(ChassisBase):
if reboot_cause: if reboot_cause:
return self.REBOOT_CAUSE_NON_HARDWARE, '' return self.REBOOT_CAUSE_NON_HARDWARE, ''
if not self._wait_reboot_cause_ready():
logger.log_error("Hardware reboot cause is not ready")
return self.REBOOT_CAUSE_NON_HARDWARE, ''
if not self.reboot_cause_initialized: if not self.reboot_cause_initialized:
self.initialize_reboot_cause() self.initialize_reboot_cause()

View File

@ -194,6 +194,7 @@ class TestChassis:
assert status is True assert status is True
assert 'sfp' in event_dict and not event_dict['sfp'] assert 'sfp' in event_dict and not event_dict['sfp']
@mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=True))
def test_reboot_cause(self): def test_reboot_cause(self):
from sonic_platform import utils from sonic_platform import utils
from sonic_platform.chassis import REBOOT_CAUSE_ROOT from sonic_platform.chassis import REBOOT_CAUSE_ROOT
@ -242,6 +243,22 @@ class TestChassis:
assert minor == value assert minor == value
mock_file_content[file_path] = 0 mock_file_content[file_path] = 0
@mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=False))
def test_reboot_cause_timeout(self):
    """get_reboot_cause reports a non-hardware cause when the wait returns False."""
    platform_chassis = Chassis()
    # The ready wait is patched to return False for this test.
    cause, description = platform_chassis.get_reboot_cause()
    assert cause == platform_chassis.REBOOT_CAUSE_NON_HARDWARE
    assert description == ''
@mock.patch('sonic_platform.utils.read_int_from_file')
@mock.patch('sonic_platform.chassis.time.sleep', mock.MagicMock())
def test_wait_reboot_cause_ready(self, read_int_mock):
    """_wait_reboot_cause_ready follows the value read from the ready file."""
    platform_chassis = Chassis()

    # Ready flag reads 1: the wait succeeds.
    read_int_mock.return_value = 1
    assert platform_chassis._wait_reboot_cause_ready()

    # Ready flag stays 0: the wait gives up (sleep is patched out).
    read_int_mock.return_value = 0
    assert not platform_chassis._wait_reboot_cause_ready()
def test_parse_warmfast_reboot_from_proc_cmdline(self): def test_parse_warmfast_reboot_from_proc_cmdline(self):
chassis = Chassis() chassis = Chassis()
with mock.patch("builtins.open", mock.mock_open(read_data="SONIC_BOOT_TYPE=warm")): with mock.patch("builtins.open", mock.mock_open(read_data="SONIC_BOOT_TYPE=warm")):