From 42e29246b9b7a336a2313ddfd74972d3ac70c260 Mon Sep 17 00:00:00 2001 From: Sujin Kang Date: Mon, 22 Aug 2022 10:52:01 -0700 Subject: [PATCH] Add the hw reboot cause if it happened during a software reboot (#11792) * Add the hw reboot cause if it happened during a software reboot * fix the unknown software reboot cause --- .../scripts/determine-reboot-cause | 68 ++++++++++++++----- .../tests/determine-reboot-cause_test.py | 63 ++++++++++++++++- 2 files changed, 112 insertions(+), 19 deletions(-) diff --git a/src/sonic-host-services/scripts/determine-reboot-cause b/src/sonic-host-services/scripts/determine-reboot-cause index 1408ad0e29..754541edb6 100755 --- a/src/sonic-host-services/scripts/determine-reboot-cause +++ b/src/sonic-host-services/scripts/determine-reboot-cause @@ -122,7 +122,12 @@ def find_hardware_reboot_cause(): else: sonic_logger.log_info("No reboot cause found from platform api") - hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor) + hardware_reboot_cause_minor_str = "" + if hardware_reboot_cause_minor: + hardware_reboot_cause_minor_str = " ({})".format(hardware_reboot_cause_minor) + + hardware_reboot_cause = hardware_reboot_cause_major + hardware_reboot_cause_minor_str + return hardware_reboot_cause @@ -158,6 +163,50 @@ def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time): return reboot_cause_dict +def determine_reboot_cause(): + # This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides + # any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE + additional_reboot_info = "N/A" + + # 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline + proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause() + + # 2. Check if the previous reboot was caused by hardware + # If yes, the hardware reboot cause will be treated as the reboot cause + hardware_reboot_cause = find_hardware_reboot_cause() + + # 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related + # reboot info. We will use it as the previous cause. + software_reboot_cause = find_software_reboot_cause() + + # The main decision logic of the reboot cause: + # If there is a valid hardware reboot cause indicated by platform API, + # check the software reboot cause to add additional rebot cause. + # If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot + # the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt + # will be treated as the additional reboot cause + # Elif there is a cmdline reboot cause, + # the software_reboot_cause will be treated as the reboot cause if it's not unknown + # otherwise, the cmdline_reboot_cause will be treated as the reboot cause if it's not none + # Else the software_reboot_cause will be treated as the reboot cause + if REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause: + previous_reboot_cause = hardware_reboot_cause + # Check if any software reboot was issued before this hardware reboot happened + if REBOOT_CAUSE_UNKNOWN not in software_reboot_cause: + additional_reboot_info = software_reboot_cause + elif proc_cmdline_reboot_cause is not None: + additional_reboot_info = proc_cmdline_reboot_cause + elif proc_cmdline_reboot_cause is not None: + if REBOOT_CAUSE_UNKNOWN not in software_reboot_cause: + # Get the reboot cause from REBOOT_CAUSE_FILE + previous_reboot_cause = software_reboot_cause + else: + previous_reboot_cause = proc_cmdline_reboot_cause + else: + previous_reboot_cause = software_reboot_cause + + return previous_reboot_cause, additional_reboot_info + def main(): # Configure logger to log all messages INFO level and higher @@ -177,22 +226,7 @@ def main(): if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE): os.remove(PREVIOUS_REBOOT_CAUSE_FILE) - # This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides - # any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE - additional_reboot_info = "N/A" - - # Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline - proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause() - - # If /proc/cmdline does not indicate reboot cause, check if the previous reboot was caused by hardware - if proc_cmdline_reboot_cause is None: - previous_reboot_cause = find_hardware_reboot_cause() - if previous_reboot_cause.startswith(REBOOT_CAUSE_NON_HARDWARE): - # If the reboot cause is non-hardware, get the reboot cause from REBOOT_CAUSE_FILE - previous_reboot_cause = find_software_reboot_cause() - else: - # Get the reboot cause from REBOOT_CAUSE_FILE - previous_reboot_cause = find_software_reboot_cause() + previous_reboot_cause, additional_reboot_info = determine_reboot_cause() # Current time reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) diff --git a/src/sonic-host-services/tests/determine-reboot-cause_test.py b/src/sonic-host-services/tests/determine-reboot-cause_test.py index 7d22a512f8..b4c07af468 100644 --- a/src/sonic-host-services/tests/determine-reboot-cause_test.py +++ b/src/sonic-host-services/tests/determine-reboot-cause_test.py @@ -54,11 +54,16 @@ REBOOT_CAUSE_KERNEL_PANIC = "Kernel Panic [Time: Sun Mar 28 13:45:12 UTC 2021]" GEN_TIME_KERNEL_PANIC = "2021_3_28_13_48_49" +REBOOT_CAUSE_UNKNOWN = "Unknown" +REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware" +EXPECTED_NON_HARDWARE_REBOOT_CAUSE = {REBOOT_CAUSE_NON_HARDWARE, "N/A"} +REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other" +EXPECTED_HARDWARE_REBOOT_CAUSE = {REBOOT_CAUSE_HARDWARE_OTHER, ""} + EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE = "warm-reboot" EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER = "User issued 'warm-reboot' command [User: admin, Time: Mon Nov 2 22:37:45 UTC 2020]" EXPECTED_FIND_FIRSTBOOT_VERSION = " (First boot of SONiC version 20191130.52)" EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_FIRSTBOOT = "Unknown (First boot of SONiC version 20191130.52)" -EXPECTED_HARDWARE_REBOOT_CAUSE = {"warm-reboot", ""} EXPECTED_WATCHDOG_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_15_08', 'cause': 'Watchdog', 'user': 'N/A', 'time': 'N/A'} EXPECTED_USER_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_14_07', 'cause': 'reboot', 'user': 'admin', 'time': 'Thu Oct 22 03:11:08 UTC 2020'} @@ -104,7 +109,12 @@ class TestDetermineRebootCause(object): def test_find_hardware_reboot_cause(self): with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", None)): result = determine_reboot_cause.find_hardware_reboot_cause() - assert result == "Powerloss (None)" + assert result == "Powerloss" + + def test_find_hardware_reboot_cause_with_minor(self): + with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", "under-voltage")): + result = determine_reboot_cause.find_hardware_reboot_cause() + assert result == "Powerloss (under-voltage)" def test_get_reboot_cause_dict_watchdog(self): reboot_cause_dict = determine_reboot_cause.get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG) @@ -117,3 +127,52 @@ class TestDetermineRebootCause(object): def test_get_reboot_cause_dict_kernel_panic(self): reboot_cause_dict = determine_reboot_cause.get_reboot_cause_dict(REBOOT_CAUSE_KERNEL_PANIC, "", GEN_TIME_KERNEL_PANIC) assert reboot_cause_dict == EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT + + def test_determine_reboot_cause_hardware(self): + with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=None): + with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=REBOOT_CAUSE_UNKNOWN): + with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE): + previous_reboot_cause, additional_reboot_info = determine_reboot_cause.determine_reboot_cause() + assert previous_reboot_cause == EXPECTED_HARDWARE_REBOOT_CAUSE + assert additional_reboot_info == "N/A" + + def test_determine_reboot_cause_software(self): + with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=None): + with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER): + with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_NON_HARDWARE_REBOOT_CAUSE): + previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause() + assert previous_reboot_cause == EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER + assert additional_info == "N/A" + + def test_determine_reboot_cause_cmdline_software(self): + with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE): + with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER): + with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_NON_HARDWARE_REBOOT_CAUSE): + previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause() + assert previous_reboot_cause == EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER + assert additional_info == "N/A" + + def test_determine_reboot_cause_cmdline_no_software(self): + with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE): + with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=REBOOT_CAUSE_UNKNOWN): + with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_NON_HARDWARE_REBOOT_CAUSE): + previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause() + assert previous_reboot_cause == EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE + assert additional_info == "N/A" + + def test_determine_reboot_cause_cmdline_hardware(self): + with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE): + with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=REBOOT_CAUSE_UNKNOWN): + with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE): + previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause() + assert previous_reboot_cause == EXPECTED_HARDWARE_REBOOT_CAUSE + assert additional_info == EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE + + def test_determine_reboot_cause_software_hardware(self): + with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE): + with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER): + with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE): + previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause() + assert previous_reboot_cause == EXPECTED_HARDWARE_REBOOT_CAUSE + assert additional_info == EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER +