Add the hw reboot cause if it happened during a software reboot (#11792)
* Add the hw reboot cause if it happened during a software reboot * fix the unknown software reboot cause
This commit is contained in:
parent
32bee5df2d
commit
42e29246b9
@ -122,7 +122,12 @@ def find_hardware_reboot_cause():
|
|||||||
else:
|
else:
|
||||||
sonic_logger.log_info("No reboot cause found from platform api")
|
sonic_logger.log_info("No reboot cause found from platform api")
|
||||||
|
|
||||||
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
|
hardware_reboot_cause_minor_str = ""
|
||||||
|
if hardware_reboot_cause_minor:
|
||||||
|
hardware_reboot_cause_minor_str = " ({})".format(hardware_reboot_cause_minor)
|
||||||
|
|
||||||
|
hardware_reboot_cause = hardware_reboot_cause_major + hardware_reboot_cause_minor_str
|
||||||
|
|
||||||
return hardware_reboot_cause
|
return hardware_reboot_cause
|
||||||
|
|
||||||
|
|
||||||
@ -158,6 +163,50 @@ def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
|
|||||||
|
|
||||||
return reboot_cause_dict
|
return reboot_cause_dict
|
||||||
|
|
||||||
|
def determine_reboot_cause():
|
||||||
|
# This variable is kept for future use. When proc_cmd_line/vendor/software provides
|
||||||
|
# any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE
|
||||||
|
additional_reboot_info = "N/A"
|
||||||
|
|
||||||
|
# 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
|
||||||
|
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()
|
||||||
|
|
||||||
|
# 2. Check if the previous reboot was caused by hardware
|
||||||
|
# If yes, the hardware reboot cause will be treated as the reboot cause
|
||||||
|
hardware_reboot_cause = find_hardware_reboot_cause()
|
||||||
|
|
||||||
|
# 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
|
||||||
|
# reboot info. We will use it as the previous cause.
|
||||||
|
software_reboot_cause = find_software_reboot_cause()
|
||||||
|
|
||||||
|
# The main decision logic of the reboot cause:
|
||||||
|
# If there is a valid hardware reboot cause indicated by platform API,
|
||||||
|
# check the software reboot cause to add an additional reboot cause.
|
||||||
|
# If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot
|
||||||
|
# the software_reboot_cause which is the content of /host/reboot-cause/reboot-cause.txt
|
||||||
|
# will be treated as the additional reboot cause
|
||||||
|
# Elif there is a cmdline reboot cause,
|
||||||
|
# the software_reboot_cause will be treated as the reboot cause if it's not unknown
|
||||||
|
# otherwise, the cmdline_reboot_cause will be treated as the reboot cause if it's not none
|
||||||
|
# Else the software_reboot_cause will be treated as the reboot cause
|
||||||
|
if REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause:
|
||||||
|
previous_reboot_cause = hardware_reboot_cause
|
||||||
|
# Check if any software reboot was issued before this hardware reboot happened
|
||||||
|
if REBOOT_CAUSE_UNKNOWN not in software_reboot_cause:
|
||||||
|
additional_reboot_info = software_reboot_cause
|
||||||
|
elif proc_cmdline_reboot_cause is not None:
|
||||||
|
additional_reboot_info = proc_cmdline_reboot_cause
|
||||||
|
elif proc_cmdline_reboot_cause is not None:
|
||||||
|
if REBOOT_CAUSE_UNKNOWN not in software_reboot_cause:
|
||||||
|
# Get the reboot cause from REBOOT_CAUSE_FILE
|
||||||
|
previous_reboot_cause = software_reboot_cause
|
||||||
|
else:
|
||||||
|
previous_reboot_cause = proc_cmdline_reboot_cause
|
||||||
|
else:
|
||||||
|
previous_reboot_cause = software_reboot_cause
|
||||||
|
|
||||||
|
return previous_reboot_cause, additional_reboot_info
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Configure logger to log all messages INFO level and higher
|
# Configure logger to log all messages INFO level and higher
|
||||||
@ -177,22 +226,7 @@ def main():
|
|||||||
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
|
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
|
||||||
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)
|
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)
|
||||||
|
|
||||||
# This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides
|
previous_reboot_cause, additional_reboot_info = determine_reboot_cause()
|
||||||
# any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE
|
|
||||||
additional_reboot_info = "N/A"
|
|
||||||
|
|
||||||
# Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
|
|
||||||
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()
|
|
||||||
|
|
||||||
# If /proc/cmdline does not indicate reboot cause, check if the previous reboot was caused by hardware
|
|
||||||
if proc_cmdline_reboot_cause is None:
|
|
||||||
previous_reboot_cause = find_hardware_reboot_cause()
|
|
||||||
if previous_reboot_cause.startswith(REBOOT_CAUSE_NON_HARDWARE):
|
|
||||||
# If the reboot cause is non-hardware, get the reboot cause from REBOOT_CAUSE_FILE
|
|
||||||
previous_reboot_cause = find_software_reboot_cause()
|
|
||||||
else:
|
|
||||||
# Get the reboot cause from REBOOT_CAUSE_FILE
|
|
||||||
previous_reboot_cause = find_software_reboot_cause()
|
|
||||||
|
|
||||||
# Current time
|
# Current time
|
||||||
reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
|
reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
|
||||||
|
@ -54,11 +54,16 @@ REBOOT_CAUSE_KERNEL_PANIC = "Kernel Panic [Time: Sun Mar 28 13:45:12 UTC 2021]"
|
|||||||
GEN_TIME_KERNEL_PANIC = "2021_3_28_13_48_49"
|
GEN_TIME_KERNEL_PANIC = "2021_3_28_13_48_49"
|
||||||
|
|
||||||
|
|
||||||
|
REBOOT_CAUSE_UNKNOWN = "Unknown"
|
||||||
|
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
|
||||||
|
EXPECTED_NON_HARDWARE_REBOOT_CAUSE = {REBOOT_CAUSE_NON_HARDWARE, "N/A"}
|
||||||
|
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
|
||||||
|
EXPECTED_HARDWARE_REBOOT_CAUSE = {REBOOT_CAUSE_HARDWARE_OTHER, ""}
|
||||||
|
|
||||||
EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE = "warm-reboot"
|
EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE = "warm-reboot"
|
||||||
EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER = "User issued 'warm-reboot' command [User: admin, Time: Mon Nov 2 22:37:45 UTC 2020]"
|
EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER = "User issued 'warm-reboot' command [User: admin, Time: Mon Nov 2 22:37:45 UTC 2020]"
|
||||||
EXPECTED_FIND_FIRSTBOOT_VERSION = " (First boot of SONiC version 20191130.52)"
|
EXPECTED_FIND_FIRSTBOOT_VERSION = " (First boot of SONiC version 20191130.52)"
|
||||||
EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_FIRSTBOOT = "Unknown (First boot of SONiC version 20191130.52)"
|
EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_FIRSTBOOT = "Unknown (First boot of SONiC version 20191130.52)"
|
||||||
EXPECTED_HARDWARE_REBOOT_CAUSE = {"warm-reboot", ""}
|
|
||||||
|
|
||||||
EXPECTED_WATCHDOG_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_15_08', 'cause': 'Watchdog', 'user': 'N/A', 'time': 'N/A'}
|
EXPECTED_WATCHDOG_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_15_08', 'cause': 'Watchdog', 'user': 'N/A', 'time': 'N/A'}
|
||||||
EXPECTED_USER_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_14_07', 'cause': 'reboot', 'user': 'admin', 'time': 'Thu Oct 22 03:11:08 UTC 2020'}
|
EXPECTED_USER_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_14_07', 'cause': 'reboot', 'user': 'admin', 'time': 'Thu Oct 22 03:11:08 UTC 2020'}
|
||||||
@ -104,7 +109,12 @@ class TestDetermineRebootCause(object):
|
|||||||
def test_find_hardware_reboot_cause(self):
|
def test_find_hardware_reboot_cause(self):
|
||||||
with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", None)):
|
with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", None)):
|
||||||
result = determine_reboot_cause.find_hardware_reboot_cause()
|
result = determine_reboot_cause.find_hardware_reboot_cause()
|
||||||
assert result == "Powerloss (None)"
|
assert result == "Powerloss"
|
||||||
|
|
||||||
|
def test_find_hardware_reboot_cause_with_minor(self):
|
||||||
|
with mock.patch("determine_reboot_cause.get_reboot_cause_from_platform", return_value=("Powerloss", "under-voltage")):
|
||||||
|
result = determine_reboot_cause.find_hardware_reboot_cause()
|
||||||
|
assert result == "Powerloss (under-voltage)"
|
||||||
|
|
||||||
def test_get_reboot_cause_dict_watchdog(self):
|
def test_get_reboot_cause_dict_watchdog(self):
|
||||||
reboot_cause_dict = determine_reboot_cause.get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG)
|
reboot_cause_dict = determine_reboot_cause.get_reboot_cause_dict(REBOOT_CAUSE_WATCHDOG, "", GEN_TIME_WATCHDOG)
|
||||||
@ -117,3 +127,52 @@ class TestDetermineRebootCause(object):
|
|||||||
def test_get_reboot_cause_dict_kernel_panic(self):
|
def test_get_reboot_cause_dict_kernel_panic(self):
|
||||||
reboot_cause_dict = determine_reboot_cause.get_reboot_cause_dict(REBOOT_CAUSE_KERNEL_PANIC, "", GEN_TIME_KERNEL_PANIC)
|
reboot_cause_dict = determine_reboot_cause.get_reboot_cause_dict(REBOOT_CAUSE_KERNEL_PANIC, "", GEN_TIME_KERNEL_PANIC)
|
||||||
assert reboot_cause_dict == EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT
|
assert reboot_cause_dict == EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT
|
||||||
|
|
||||||
|
def test_determine_reboot_cause_hardware(self):
|
||||||
|
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=None):
|
||||||
|
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=REBOOT_CAUSE_UNKNOWN):
|
||||||
|
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE):
|
||||||
|
previous_reboot_cause, additional_reboot_info = determine_reboot_cause.determine_reboot_cause()
|
||||||
|
assert previous_reboot_cause == EXPECTED_HARDWARE_REBOOT_CAUSE
|
||||||
|
assert additional_reboot_info == "N/A"
|
||||||
|
|
||||||
|
def test_determine_reboot_cause_software(self):
|
||||||
|
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=None):
|
||||||
|
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER):
|
||||||
|
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_NON_HARDWARE_REBOOT_CAUSE):
|
||||||
|
previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause()
|
||||||
|
assert previous_reboot_cause == EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER
|
||||||
|
assert additional_info == "N/A"
|
||||||
|
|
||||||
|
def test_determine_reboot_cause_cmdline_software(self):
|
||||||
|
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE):
|
||||||
|
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER):
|
||||||
|
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_NON_HARDWARE_REBOOT_CAUSE):
|
||||||
|
previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause()
|
||||||
|
assert previous_reboot_cause == EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER
|
||||||
|
assert additional_info == "N/A"
|
||||||
|
|
||||||
|
def test_determine_reboot_cause_cmdline_no_software(self):
|
||||||
|
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE):
|
||||||
|
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=REBOOT_CAUSE_UNKNOWN):
|
||||||
|
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_NON_HARDWARE_REBOOT_CAUSE):
|
||||||
|
previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause()
|
||||||
|
assert previous_reboot_cause == EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE
|
||||||
|
assert additional_info == "N/A"
|
||||||
|
|
||||||
|
def test_determine_reboot_cause_cmdline_hardware(self):
|
||||||
|
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE):
|
||||||
|
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=REBOOT_CAUSE_UNKNOWN):
|
||||||
|
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE):
|
||||||
|
previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause()
|
||||||
|
assert previous_reboot_cause == EXPECTED_HARDWARE_REBOOT_CAUSE
|
||||||
|
assert additional_info == EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE
|
||||||
|
|
||||||
|
def test_determine_reboot_cause_software_hardware(self):
|
||||||
|
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE):
|
||||||
|
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER):
|
||||||
|
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE):
|
||||||
|
previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause()
|
||||||
|
assert previous_reboot_cause == EXPECTED_HARDWARE_REBOOT_CAUSE
|
||||||
|
assert additional_info == EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_USER
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user