Fix post-reboot errors in platforms without sonic_platform package (#6130)

Refactor determine-reboot cause code. Fix errors seen during determine-reboot-cause when sonic_platform package is not installed.
Add error handling for healthd service when sonic_platform package is not installed.

Tested on KVM where sonic_platform is not present, and the errors are not seen anymore in syslog.
This commit is contained in:
Vaibhav Hemant Dixit 2020-12-17 12:01:42 -08:00 committed by GitHub
parent 9aeb0a1bb1
commit e8da2ee975
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 48 deletions

View File

@ -40,7 +40,8 @@ REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*"
REBOOT_TYPE_KEXEC_PATTERN_FAST = ".*SONIC_BOOT_TYPE=(fast|fast-reboot).*" REBOOT_TYPE_KEXEC_PATTERN_FAST = ".*SONIC_BOOT_TYPE=(fast|fast-reboot).*"
REBOOT_CAUSE_UNKNOWN = "Unknown" REBOOT_CAUSE_UNKNOWN = "Unknown"
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
# Global logger class instance # Global logger class instance
sonic_logger = logger.Logger(SYSLOG_IDENTIFIER) sonic_logger = logger.Logger(SYSLOG_IDENTIFIER)
@ -98,43 +99,27 @@ def find_proc_cmdline_reboot_cause():
return proc_cmdline_reboot_cause return proc_cmdline_reboot_cause
def get_reboot_cause_from_platform():
# Until all platform vendors have provided sonic_platform packages, def find_hardware_reboot_cause():
# if there is no sonic_platform package installed, we only provide # Find hardware reboot cause using sonic_platform library
# software-related reboot causes.
try: try:
import sonic_platform import sonic_platform
platform = sonic_platform.platform.Platform() platform = sonic_platform.platform.Platform()
chassis = platform.get_chassis() chassis = platform.get_chassis()
return chassis.get_reboot_cause() hardware_reboot_cause_major, hardware_reboot_cause_minor = chassis.get_reboot_cause()
except ImportError as err:
sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
def find_hardware_reboot_cause():
hardware_reboot_cause = None
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
hardware_reboot_cause_major, hardware_reboot_cause_minor = get_reboot_cause_from_platform()
sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)) sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor))
except ImportError:
sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
hardware_reboot_cause_major, hardware_reboot_cause_minor = REBOOT_CAUSE_NON_HARDWARE, "N/A"
if hardware_reboot_cause_major == REBOOT_CAUSE_NON_HARDWARE: if hardware_reboot_cause_major:
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause_major))
# contain any software-related reboot info. We will use it as the previous cause.
pass
elif hardware_reboot_cause_major == REBOOT_CAUSE_HARDWARE_OTHER:
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
else:
hardware_reboot_cause = hardware_reboot_cause_major
if hardware_reboot_cause:
sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause))
else: else:
sonic_logger.log_info("No reboot cause found from platform api") sonic_logger.log_info("No reboot cause found from platform api")
return hardware_reboot_cause, hardware_reboot_cause_minor hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
return hardware_reboot_cause
def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time): def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
# resultant dictionary # resultant dictionary
@ -175,14 +160,16 @@ def main():
os.remove(PREVIOUS_REBOOT_CAUSE_FILE) os.remove(PREVIOUS_REBOOT_CAUSE_FILE)
hardware_reboot_cause = None hardware_reboot_cause = None
additional_reboot_info = None # This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides
# any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE
additional_reboot_info = "N/A"
# 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline # 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause() proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()
# 2. Check if the previous reboot was caused by hardware # 2. Check if the previous reboot was caused by hardware
# If yes, the hardware reboot cause will be treated as the reboot cause # If yes, the hardware reboot cause will be treated as the reboot cause
(hardware_reboot_cause, additional_reboot_info) = find_hardware_reboot_cause() hardware_reboot_cause = find_hardware_reboot_cause()
# 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related # 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
# reboot info. We will use it as the previous cause. # reboot info. We will use it as the previous cause.
@ -197,7 +184,7 @@ def main():
# Else the software_reboot_cause will be treated as the reboot cause # Else the software_reboot_cause will be treated as the reboot cause
if proc_cmdline_reboot_cause is not None: if proc_cmdline_reboot_cause is not None:
previous_reboot_cause = software_reboot_cause previous_reboot_cause = software_reboot_cause
elif hardware_reboot_cause is not None: elif hardware_reboot_cause is not REBOOT_CAUSE_NON_HARDWARE:
previous_reboot_cause = hardware_reboot_cause previous_reboot_cause = hardware_reboot_cause
else: else:
previous_reboot_cause = software_reboot_cause previous_reboot_cause = software_reboot_cause

View File

@ -99,9 +99,8 @@ class TestDetermineRebootCause(object):
assert result == "fast-reboot" assert result == "fast-reboot"
def test_find_hardware_reboot_cause(self): def test_find_hardware_reboot_cause(self):
with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", None)):
result = find_hardware_reboot_cause() result = find_hardware_reboot_cause()
assert result == ("Powerloss", None) assert result == "Non-Hardware (N/A)"
def test_get_reboot_cause_dict_watchdog(self): def test_get_reboot_cause_dict_watchdog(self):
reboot_cause_dict = get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG) reboot_cause_dict = get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG)
@ -113,5 +112,5 @@ class TestDetermineRebootCause(object):
@classmethod @classmethod
def teardown_class(cls): def teardown_class(cls):
print("TREARDOWN") print("TEARDOWN")

View File

@ -68,6 +68,7 @@ class HealthDaemon(DaemonBase):
""" """
self.log_notice("Starting up...") self.log_notice("Starting up...")
try:
import sonic_platform.platform import sonic_platform.platform
chassis = sonic_platform.platform.Platform().get_chassis() chassis = sonic_platform.platform.Platform().get_chassis()
manager = HealthCheckerManager() manager = HealthCheckerManager()
@ -81,6 +82,8 @@ class HealthDaemon(DaemonBase):
if self.stop_event.wait(manager.config.interval): if self.stop_event.wait(manager.config.interval):
break break
except ImportError:
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
self.deinit() self.deinit()