Fix post-reboot errors in platforms without sonic_platform package (#6130)
Refactor determine-reboot cause code. Fix errors seen during determine-reboot-cause when sonic_platform package is not installed. Add error handling for healthd service when sonic_platform package is not installed. Tested on KVM where sonic_platform is not present, and the errors are not seen anymore in syslog.
This commit is contained in:
parent
9aeb0a1bb1
commit
e8da2ee975
@ -40,7 +40,8 @@ REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*"
|
||||
REBOOT_TYPE_KEXEC_PATTERN_FAST = ".*SONIC_BOOT_TYPE=(fast|fast-reboot).*"
|
||||
|
||||
REBOOT_CAUSE_UNKNOWN = "Unknown"
|
||||
|
||||
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
|
||||
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
|
||||
|
||||
# Global logger class instance
|
||||
sonic_logger = logger.Logger(SYSLOG_IDENTIFIER)
|
||||
@ -98,43 +99,27 @@ def find_proc_cmdline_reboot_cause():
|
||||
|
||||
return proc_cmdline_reboot_cause
|
||||
|
||||
def get_reboot_cause_from_platform():
|
||||
# Until all platform vendors have provided sonic_platform packages,
|
||||
# if there is no sonic_platform package installed, we only provide
|
||||
# software-related reboot causes.
|
||||
|
||||
def find_hardware_reboot_cause():
|
||||
# Find hardware reboot cause using sonic_platform library
|
||||
try:
|
||||
import sonic_platform
|
||||
platform = sonic_platform.platform.Platform()
|
||||
chassis = platform.get_chassis()
|
||||
return chassis.get_reboot_cause()
|
||||
except ImportError as err:
|
||||
hardware_reboot_cause_major, hardware_reboot_cause_minor = chassis.get_reboot_cause()
|
||||
sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor))
|
||||
except ImportError:
|
||||
sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
|
||||
hardware_reboot_cause_major, hardware_reboot_cause_minor = REBOOT_CAUSE_NON_HARDWARE, "N/A"
|
||||
|
||||
|
||||
def find_hardware_reboot_cause():
|
||||
hardware_reboot_cause = None
|
||||
|
||||
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
|
||||
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
|
||||
|
||||
hardware_reboot_cause_major, hardware_reboot_cause_minor = get_reboot_cause_from_platform()
|
||||
sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor))
|
||||
|
||||
if hardware_reboot_cause_major == REBOOT_CAUSE_NON_HARDWARE:
|
||||
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will
|
||||
# contain any software-related reboot info. We will use it as the previous cause.
|
||||
pass
|
||||
elif hardware_reboot_cause_major == REBOOT_CAUSE_HARDWARE_OTHER:
|
||||
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
|
||||
else:
|
||||
hardware_reboot_cause = hardware_reboot_cause_major
|
||||
|
||||
if hardware_reboot_cause:
|
||||
sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause))
|
||||
if hardware_reboot_cause_major:
|
||||
sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause_major))
|
||||
else:
|
||||
sonic_logger.log_info("No reboot cause found from platform api")
|
||||
|
||||
return hardware_reboot_cause, hardware_reboot_cause_minor
|
||||
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
|
||||
return hardware_reboot_cause
|
||||
|
||||
|
||||
def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
|
||||
# resultant dictionary
|
||||
@ -175,14 +160,16 @@ def main():
|
||||
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)
|
||||
|
||||
hardware_reboot_cause = None
|
||||
additional_reboot_info = None
|
||||
# This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides
|
||||
# any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE
|
||||
additional_reboot_info = "N/A"
|
||||
|
||||
# 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
|
||||
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()
|
||||
|
||||
# 2. Check if the previous reboot was caused by hardware
|
||||
# If yes, the hardware reboot cause will be treated as the reboot cause
|
||||
(hardware_reboot_cause, additional_reboot_info) = find_hardware_reboot_cause()
|
||||
hardware_reboot_cause = find_hardware_reboot_cause()
|
||||
|
||||
# 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
|
||||
# reboot info. We will use it as the previous cause.
|
||||
@ -197,7 +184,7 @@ def main():
|
||||
# Else the software_reboot_cause will be treated as the reboot cause
|
||||
if proc_cmdline_reboot_cause is not None:
|
||||
previous_reboot_cause = software_reboot_cause
|
||||
elif hardware_reboot_cause is not None:
|
||||
elif hardware_reboot_cause is not REBOOT_CAUSE_NON_HARDWARE:
|
||||
previous_reboot_cause = hardware_reboot_cause
|
||||
else:
|
||||
previous_reboot_cause = software_reboot_cause
|
||||
|
@ -99,9 +99,8 @@ class TestDetermineRebootCause(object):
|
||||
assert result == "fast-reboot"
|
||||
|
||||
def test_find_hardware_reboot_cause(self):
|
||||
with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", None)):
|
||||
result = find_hardware_reboot_cause()
|
||||
assert result == ("Powerloss", None)
|
||||
result = find_hardware_reboot_cause()
|
||||
assert result == "Non-Hardware (N/A)"
|
||||
|
||||
def test_get_reboot_cause_dict_watchdog(self):
|
||||
reboot_cause_dict = get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG)
|
||||
@ -113,5 +112,5 @@ class TestDetermineRebootCause(object):
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls):
|
||||
print("TREARDOWN")
|
||||
print("TEARDOWN")
|
||||
|
||||
|
@ -68,19 +68,22 @@ class HealthDaemon(DaemonBase):
|
||||
"""
|
||||
self.log_notice("Starting up...")
|
||||
|
||||
import sonic_platform.platform
|
||||
chassis = sonic_platform.platform.Platform().get_chassis()
|
||||
manager = HealthCheckerManager()
|
||||
if not manager.config.config_file_exists():
|
||||
self.log_warning("System health configuration file not found, exit...")
|
||||
return
|
||||
while 1:
|
||||
state, stat = manager.check(chassis)
|
||||
if state == HealthCheckerManager.STATE_RUNNING:
|
||||
self._process_stat(chassis, manager.config, stat)
|
||||
try:
|
||||
import sonic_platform.platform
|
||||
chassis = sonic_platform.platform.Platform().get_chassis()
|
||||
manager = HealthCheckerManager()
|
||||
if not manager.config.config_file_exists():
|
||||
self.log_warning("System health configuration file not found, exit...")
|
||||
return
|
||||
while 1:
|
||||
state, stat = manager.check(chassis)
|
||||
if state == HealthCheckerManager.STATE_RUNNING:
|
||||
self._process_stat(chassis, manager.config, stat)
|
||||
|
||||
if self.stop_event.wait(manager.config.interval):
|
||||
break
|
||||
if self.stop_event.wait(manager.config.interval):
|
||||
break
|
||||
except ImportError:
|
||||
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
|
||||
|
||||
self.deinit()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user