From e8da2ee97549b16a95abe5adb7a97126b082edc8 Mon Sep 17 00:00:00 2001 From: Vaibhav Hemant Dixit Date: Thu, 17 Dec 2020 12:01:42 -0800 Subject: [PATCH] Fix post-reboot errors in platforms without sonic_platform package (#6130) Refactor determine-reboot cause code. Fix errors seen during determine-reboot-cause when sonic_platform package is not installed. Add error handling for healthd service when sonic_platform package is not installed. Tested on KVM where sonic_platform is not present, and the errors are not seen anymore in syslog. --- .../scripts/determine-reboot-cause | 51 +++++++------------ .../tests/determine-reboot-cause_test.py | 7 ++- src/system-health/scripts/healthd | 27 +++++----- 3 files changed, 37 insertions(+), 48 deletions(-) diff --git a/src/sonic-host-services/scripts/determine-reboot-cause b/src/sonic-host-services/scripts/determine-reboot-cause index dcce6182e3..501b279ec1 100755 --- a/src/sonic-host-services/scripts/determine-reboot-cause +++ b/src/sonic-host-services/scripts/determine-reboot-cause @@ -40,7 +40,8 @@ REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*" REBOOT_TYPE_KEXEC_PATTERN_FAST = ".*SONIC_BOOT_TYPE=(fast|fast-reboot).*" REBOOT_CAUSE_UNKNOWN = "Unknown" - +REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware" +REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other" # Global logger class instance sonic_logger = logger.Logger(SYSLOG_IDENTIFIER) @@ -98,43 +99,27 @@ def find_proc_cmdline_reboot_cause(): return proc_cmdline_reboot_cause -def get_reboot_cause_from_platform(): - # Until all platform vendors have provided sonic_platform packages, - # if there is no sonic_platform package installed, we only provide - # software-related reboot causes. + +def find_hardware_reboot_cause(): + # Find hardware reboot cause using sonic_platform library try: import sonic_platform platform = sonic_platform.platform.Platform() chassis = platform.get_chassis() - return chassis.get_reboot_cause() - except ImportError as err: + hardware_reboot_cause_major, hardware_reboot_cause_minor = chassis.get_reboot_cause() + sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)) + except ImportError: sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.") + hardware_reboot_cause_major, hardware_reboot_cause_minor = REBOOT_CAUSE_NON_HARDWARE, "N/A" - -def find_hardware_reboot_cause(): - hardware_reboot_cause = None - - REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other" - REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware" - - hardware_reboot_cause_major, hardware_reboot_cause_minor = get_reboot_cause_from_platform() - sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)) - - if hardware_reboot_cause_major == REBOOT_CAUSE_NON_HARDWARE: - # The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will - # contain any software-related reboot info. We will use it as the previous cause. - pass - elif hardware_reboot_cause_major == REBOOT_CAUSE_HARDWARE_OTHER: - hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor) - else: - hardware_reboot_cause = hardware_reboot_cause_major - - if hardware_reboot_cause: - sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause)) + if hardware_reboot_cause_major: + sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause_major)) else: sonic_logger.log_info("No reboot cause found from platform api") - return hardware_reboot_cause, hardware_reboot_cause_minor + hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor) + return hardware_reboot_cause + def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time): # resultant dictionary @@ -175,14 +160,16 @@ def main(): os.remove(PREVIOUS_REBOOT_CAUSE_FILE) hardware_reboot_cause = None - additional_reboot_info = None + # This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides + # any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE + additional_reboot_info = "N/A" # 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause() # 2. Check if the previous reboot was caused by hardware # If yes, the hardware reboot cause will be treated as the reboot cause - (hardware_reboot_cause, additional_reboot_info) = find_hardware_reboot_cause() + hardware_reboot_cause = find_hardware_reboot_cause() # 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related # reboot info. We will use it as the previous cause. @@ -197,7 +184,7 @@ def main(): # Else the software_reboot_cause will be treated as the reboot cause if proc_cmdline_reboot_cause is not None: previous_reboot_cause = software_reboot_cause - elif hardware_reboot_cause is not None: + elif hardware_reboot_cause is not REBOOT_CAUSE_NON_HARDWARE: previous_reboot_cause = hardware_reboot_cause else: previous_reboot_cause = software_reboot_cause diff --git a/src/sonic-host-services/tests/determine-reboot-cause_test.py b/src/sonic-host-services/tests/determine-reboot-cause_test.py index 9cef2aa30d..31f36dd130 100644 --- a/src/sonic-host-services/tests/determine-reboot-cause_test.py +++ b/src/sonic-host-services/tests/determine-reboot-cause_test.py @@ -99,9 +99,8 @@ class TestDetermineRebootCause(object): assert result == "fast-reboot" def test_find_hardware_reboot_cause(self): - with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", None)): - result = find_hardware_reboot_cause() - assert result == ("Powerloss", None) + result = find_hardware_reboot_cause() + assert result == "Non-Hardware (N/A)" def test_get_reboot_cause_dict_watchdog(self): reboot_cause_dict = get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG) @@ -113,5 +112,5 @@ class TestDetermineRebootCause(object): @classmethod def teardown_class(cls): - print("TREARDOWN") + print("TEARDOWN") diff --git a/src/system-health/scripts/healthd b/src/system-health/scripts/healthd index 00078d0c3e..77faf54943 100644 --- a/src/system-health/scripts/healthd +++ b/src/system-health/scripts/healthd @@ -68,19 +68,22 @@ class HealthDaemon(DaemonBase): """ self.log_notice("Starting up...") - import sonic_platform.platform - chassis = sonic_platform.platform.Platform().get_chassis() - manager = HealthCheckerManager() - if not manager.config.config_file_exists(): - self.log_warning("System health configuration file not found, exit...") - return - while 1: - state, stat = manager.check(chassis) - if state == HealthCheckerManager.STATE_RUNNING: - self._process_stat(chassis, manager.config, stat) + try: + import sonic_platform.platform + chassis = sonic_platform.platform.Platform().get_chassis() + manager = HealthCheckerManager() + if not manager.config.config_file_exists(): + self.log_warning("System health configuration file not found, exit...") + return + while 1: + state, stat = manager.check(chassis) + if state == HealthCheckerManager.STATE_RUNNING: + self._process_stat(chassis, manager.config, stat) - if self.stop_event.wait(manager.config.interval): - break + if self.stop_event.wait(manager.config.interval): + break + except ImportError: + self.log_warning("sonic_platform package not installed. Cannot start system-health daemon") self.deinit()