[201811] Check platform reboot cause to see if any reset happened during fast/warm-reboot (#8912)
[201811] Check platform reboot cause to see if any reset happened during fast/warm-reboot Why I did it To recover syncd and swss from any cold reset during fast/warm-reboot How I did it Check platform reboot-cause to see if any cold reset happens for fast-reboot power up How to verify it Manual test
This commit is contained in:
parent
9e75e856eb
commit
a80319e2d0
@ -2,6 +2,7 @@
|
|||||||
Description=Database container
|
Description=Database container
|
||||||
Requires=docker.service
|
Requires=docker.service
|
||||||
After=docker.service
|
After=docker.service
|
||||||
|
After=rc-local.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
User=root
|
User=root
|
||||||
|
@ -5,6 +5,17 @@ function getMountPoint()
|
|||||||
echo $1 | python -c "import sys, json, os; mnts = [x for x in json.load(sys.stdin)[0]['Mounts'] if x['Destination'] == '/usr/share/sonic/hwsku']; print '' if len(mnts) == 0 else os.path.basename(mnts[0]['Source'])" 2>/dev/null
|
echo $1 | python -c "import sys, json, os; mnts = [x for x in json.load(sys.stdin)[0]['Mounts'] if x['Destination'] == '/usr/share/sonic/hwsku']; print '' if len(mnts) == 0 else os.path.basename(mnts[0]['Source'])" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function isPlatformReset()
{
    # Ask the platform API for the major hardware reboot cause. stderr is
    # discarded, so if the python call prints nothing (for example because it
    # failed) ${output} is simply empty.
    output=$(python -c "import sonic_platform.platform; p = sonic_platform.platform.Platform(); c = p.get_chassis(); hw_rc_major, hw_rc_minor = c.get_reboot_cause(); print(hw_rc_major)" 2>/dev/null)

    # The exit status of the test is the function's return value:
    #   0 -> the major cause is anything other than "Non-Hardware"
    #        (this includes empty output), i.e. treat as a platform reset
    #   1 -> the platform API reported exactly "Non-Hardware"
    [[ "${output}" != "Non-Hardware" ]]
}
|
||||||
|
|
||||||
function getBootType()
|
function getBootType()
|
||||||
{
|
{
|
||||||
# same code snippet in files/scripts/syncd.sh
|
# same code snippet in files/scripts/syncd.sh
|
||||||
@ -21,6 +32,9 @@ function getBootType()
|
|||||||
*)
|
*)
|
||||||
TYPE='cold'
|
TYPE='cold'
|
||||||
esac
|
esac
|
||||||
|
if isPlatformReset; then
|
||||||
|
TYPE='cold'
|
||||||
|
fi
|
||||||
echo "${TYPE}"
|
echo "${TYPE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -59,7 +73,7 @@ function postStartAction()
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$BOOT_TYPE" == "fast" ]]; then
|
if [[ "$BOOT_TYPE" == "fast" ]]; then
|
||||||
# set the key to expire in 3 minutes
|
# set the FAST_REBOOT|system key to expire in 3 minutes
|
||||||
redis-cli -n 6 SET "FAST_REBOOT|system" "1" "EX" "180"
|
redis-cli -n 6 SET "FAST_REBOOT|system" "1" "EX" "180"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -268,6 +268,11 @@ sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause.service $FILES
|
|||||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable process-reboot-cause.service
|
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable process-reboot-cause.service
|
||||||
sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause $FILESYSTEM_ROOT/usr/bin/
|
sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause $FILESYSTEM_ROOT/usr/bin/
|
||||||
|
|
||||||
|
# Copy determine-reboot-cause service files
|
||||||
|
sudo cp $IMAGE_CONFIGS/process-reboot-cause/determine-reboot-cause.service $FILESYSTEM_ROOT/etc/systemd/system/
|
||||||
|
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable determine-reboot-cause.service
|
||||||
|
sudo cp $IMAGE_CONFIGS/process-reboot-cause/determine-reboot-cause $FILESYSTEM_ROOT/usr/bin/
|
||||||
|
|
||||||
## Install package without starting service
|
## Install package without starting service
|
||||||
## ref: https://wiki.debian.org/chroot
|
## ref: https://wiki.debian.org/chroot
|
||||||
sudo tee -a $FILESYSTEM_ROOT/usr/sbin/policy-rc.d > /dev/null <<EOF
|
sudo tee -a $FILESYSTEM_ROOT/usr/sbin/policy-rc.d > /dev/null <<EOF
|
||||||
|
@ -8,6 +8,7 @@ Requires=nps-modules-4.9.0-14-amd64.service
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
After=database.service updategraph.service
|
After=database.service updategraph.service
|
||||||
After=interfaces-config.service
|
After=interfaces-config.service
|
||||||
|
After=pmon.service
|
||||||
Before=ntp-config.service
|
Before=ntp-config.service
|
||||||
StartLimitIntervalSec=1200
|
StartLimitIntervalSec=1200
|
||||||
StartLimitBurst=3
|
StartLimitBurst=3
|
||||||
|
248
files/image_config/process-reboot-cause/determine-reboot-cause
Normal file
248
files/image_config/process-reboot-cause/determine-reboot-cause
Normal file
@ -0,0 +1,248 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#
|
||||||
|
# determine-reboot-cause
|
||||||
|
#
|
||||||
|
# Program designed to run once, soon after system boot which will
|
||||||
|
# determine the cause of the previous reboot and store it to the disk,
|
||||||
|
#
|
||||||
|
|
||||||
|
try:
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pwd
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import syslog
|
||||||
|
|
||||||
|
import sonic_device_util
|
||||||
|
except ImportError as err:
|
||||||
|
raise ImportError("%s - required module not found" % str(err))
|
||||||
|
|
||||||
|
VERSION = "1.0"
|
||||||
|
|
||||||
|
SYSLOG_IDENTIFIER = "determine-reboot-cause"
|
||||||
|
|
||||||
|
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
|
||||||
|
REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "reboot-cause.txt"
|
||||||
|
PREVIOUS_REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "previous-reboot-cause.txt"
|
||||||
|
PREVIOUS_REBOOT_CAUSE_FILE_JSON = REBOOT_CAUSE_DIR + "previous-reboot-cause.json"
|
||||||
|
FIRST_BOOT_PLATFORM_FILE = "/tmp/notify_firstboot_to_platform"
|
||||||
|
REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline"
|
||||||
|
# The following SONIC_BOOT_TYPEs come from the warm/fast reboot script which is in sonic-utilities
|
||||||
|
# Because the system can be rebooted from some old versions, we have to take all possible BOOT options into consideration.
|
||||||
|
# On 201803, 201807 we have
|
||||||
|
# BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') fast-reboot"
|
||||||
|
# On 201811 and later we have
|
||||||
|
# BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" where BOOT_TYPE_ARG can be warm, fastfast or fast
|
||||||
|
# To extract the common part of them, we should have the following PATTERN
|
||||||
|
REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*"
|
||||||
|
REBOOT_TYPE_KEXEC_PATTERN_FAST = ".*SONIC_BOOT_TYPE=(fast|fast-reboot).*"
|
||||||
|
|
||||||
|
REBOOT_CAUSE_UNKNOWN = "Unknown"
|
||||||
|
|
||||||
|
|
||||||
|
# ========================== Syslog wrappers ==========================
|
||||||
|
|
||||||
|
def log_info(msg):
    """Write msg to syslog at LOG_INFO priority, tagged with SYSLOG_IDENTIFIER."""
    priority = syslog.LOG_INFO

    # The syslog connection is opened and closed around every message.
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(priority, msg)
    syslog.closelog()
|
||||||
|
|
||||||
|
|
||||||
|
def log_warning(msg):
    """Write msg to syslog at LOG_WARNING priority, tagged with SYSLOG_IDENTIFIER."""
    priority = syslog.LOG_WARNING

    # The syslog connection is opened and closed around every message.
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(priority, msg)
    syslog.closelog()
|
||||||
|
|
||||||
|
|
||||||
|
def log_error(msg):
    """Write msg to syslog at LOG_ERR priority, tagged with SYSLOG_IDENTIFIER."""
    priority = syslog.LOG_ERR

    # The syslog connection is opened and closed around every message.
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(priority, msg)
    syslog.closelog()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================= Functions =============================
|
||||||
|
def parse_warmfast_reboot_from_proc_cmdline():
    """Classify the boot type from the first line of REBOOT_TYPE_KEXEC_FILE.

    Returns:
        'warm-reboot' if the line matches REBOOT_TYPE_KEXEC_PATTERN_WARM,
        'fast-reboot' if it matches REBOOT_TYPE_KEXEC_PATTERN_FAST,
        None if the file does not exist or neither pattern matches.
    """
    if not os.path.isfile(REBOOT_TYPE_KEXEC_FILE):
        return None

    with open(REBOOT_TYPE_KEXEC_FILE, "r") as cmdline_file:
        kernel_cmdline = cmdline_file.readline()

    # Order matters: the warm pattern is tried before the fast pattern.
    candidates = (
        (REBOOT_TYPE_KEXEC_PATTERN_WARM, 'warm-reboot'),
        (REBOOT_TYPE_KEXEC_PATTERN_FAST, 'fast-reboot'),
    )
    for pattern, reboot_type in candidates:
        found = re.search(pattern, kernel_cmdline)
        if found and found.group(1):
            return reboot_type

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_software_reboot_cause():
    """Return the software-recorded reboot cause.

    The cause is the first line of REBOOT_CAUSE_FILE (trailing newline
    stripped), or REBOOT_CAUSE_UNKNOWN when that file does not exist.

    If FIRST_BOOT_PLATFORM_FILE exists it is removed, and when the cause is
    still REBOOT_CAUSE_UNKNOWN a "(First boot of SONiC version ...)" suffix
    carrying the build version is appended.
    """
    cause = REBOOT_CAUSE_UNKNOWN

    if not os.path.isfile(REBOOT_CAUSE_FILE):
        log_info("Reboot cause file {} not found".format(REBOOT_CAUSE_FILE))
    else:
        with open(REBOOT_CAUSE_FILE, "r") as handle:
            cause = handle.readline().rstrip('\n')
        log_info("{} indicates the reboot cause: {}".format(REBOOT_CAUSE_FILE, cause))

    if os.path.isfile(FIRST_BOOT_PLATFORM_FILE):
        if cause == REBOOT_CAUSE_UNKNOWN:
            version_info = sonic_device_util.get_sonic_version_info()
            if version_info:
                build_version = version_info['build_version']
            else:
                build_version = "unknown"
            cause += " (First boot of SONiC version {})".format(build_version)
        # The marker is consumed whether or not the suffix was added.
        os.remove(FIRST_BOOT_PLATFORM_FILE)

    return cause
|
||||||
|
|
||||||
|
|
||||||
|
def find_proc_cmdline_reboot_cause():
    """Log and return the result of parse_warmfast_reboot_from_proc_cmdline().

    The returned value is passed through unchanged; a falsy result is logged
    as "no reboot cause found".
    """
    boot_type = parse_warmfast_reboot_from_proc_cmdline()

    if not boot_type:
        message = "No reboot cause found from /proc/cmdline"
    else:
        message = "/proc/cmdline indicates reboot type: {}".format(boot_type)
    log_info(message)

    return boot_type
|
||||||
|
|
||||||
|
|
||||||
|
def find_hardware_reboot_cause():
    """Query the platform API for a hardware reboot cause.

    Returns:
        None when the sonic_platform package cannot be imported or when the
        chassis reports REBOOT_CAUSE_NON_HARDWARE;
        "<major> (<minor>)" when the major cause is REBOOT_CAUSE_HARDWARE_OTHER;
        the major cause itself for any other value.
    """
    cause = None

    # Until all platform vendors have provided sonic_platform packages,
    # a missing package only means hardware causes cannot be detected;
    # software-related reboot causes are still reported by the caller.
    try:
        import sonic_platform

        chassis = sonic_platform.platform.Platform().get_chassis()
        major, minor = chassis.get_reboot_cause()

        # A non-hardware reboot leaves the cause as None so that the caller
        # falls back to the software reboot cause.
        if major != chassis.REBOOT_CAUSE_NON_HARDWARE:
            if major == chassis.REBOOT_CAUSE_HARDWARE_OTHER:
                cause = "{} ({})".format(major, minor)
            else:
                cause = major
    except ImportError:
        log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")

    if not cause:
        log_info("No reboot cause found from platform api")
    else:
        log_info("Platform api indicates reboot cause {}".format(cause))

    return cause
|
||||||
|
|
||||||
|
|
||||||
|
def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
    """Build a dictionary with the key information of a device reboot.

    The dictionary always has the keys 'gen_time', 'cause', 'user', 'time'
    and 'comment'. 'user' and 'time' default to "N/A"; 'comment' is "N/A"
    when comment is None.

    If previous_reboot_cause has the form
    "User issued '<command>' command [User: <user>, Time: <time>]", the
    command, user and time are stored in 'cause', 'user' and 'time'.
    If it has the form "Kernel Panic [Time: <time>]", 'cause' is set to
    "Kernel Panic" and 'time' to the captured time.
    Otherwise 'cause' is previous_reboot_cause unchanged.
    """
    info = {
        'gen_time': gen_time,
        'cause': previous_reboot_cause,
        'user': "N/A",
        'time': "N/A",
        'comment': "N/A" if comment is None else comment,
    }

    if 'User issued' in previous_reboot_cause:
        # Match with "User issued '{}' command [User: {}, Time: {}]"
        parsed = re.search(r'User issued \'(.*)\' command \[User: (.*), Time: (.*)\]', previous_reboot_cause)
        if parsed is not None:
            info['cause'], info['user'], info['time'] = parsed.groups()
    elif 'Kernel Panic' in previous_reboot_cause:
        parsed = re.search(r'Kernel Panic \[Time: (.*)\]', previous_reboot_cause)
        if parsed is not None:
            info['cause'] = "Kernel Panic"
            info['time'] = parsed.group(1)

    return info
|
||||||
|
|
||||||
|
def main():
    """Determine the cause of the previous reboot and persist it.

    Gathers three candidate causes (boot type from /proc/cmdline, hardware
    cause from the platform API, software cause from REBOOT_CAUSE_FILE),
    picks the previous reboot cause from them, writes it to
    PREVIOUS_REBOOT_CAUSE_FILE_JSON (as JSON) and PREVIOUS_REBOOT_CAUSE_FILE
    (as plain text), and finally resets REBOOT_CAUSE_FILE to
    REBOOT_CAUSE_UNKNOWN for the next reboot.

    Exits via sys.exit() when not run with an effective UID of 0.
    """
    log_info("Starting up...")

    if os.geteuid() != 0:
        log_error("User {} does not have permission to execute".format(pwd.getpwuid(os.getuid()).pw_name))
        sys.exit("This utility must be run as root")

    # Create REBOOT_CAUSE_DIR if it doesn't exist
    if not os.path.exists(REBOOT_CAUSE_DIR):
        os.makedirs(REBOOT_CAUSE_DIR)

    # Remove stale PREVIOUS_REBOOT_CAUSE_FILE if it exists
    if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
        os.remove(PREVIOUS_REBOOT_CAUSE_FILE)

    # Set a default previous reboot cause
    previous_reboot_cause = REBOOT_CAUSE_UNKNOWN

    # 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
    proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()

    # 2. Check if the previous reboot was caused by hardware.
    #    find_hardware_reboot_cause() returns None for a non-hardware reboot
    #    or when the platform package is unavailable.
    hardware_reboot_cause = find_hardware_reboot_cause()

    # 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
    #    reboot info.
    software_reboot_cause = find_software_reboot_cause()

    additional_reboot_info = None

    # The main decision logic of the reboot cause:
    # If there is a reboot cause indicated by /proc/cmdline, it should be warmreboot/fastreboot;
    #   the software_reboot_cause, which is the content of /host/reboot-cause/reboot-cause.txt,
    #   will be treated as the reboot cause
    # Elif there is a reboot cause indicated by platform API,
    #   the hardware_reboot_cause will be treated as the reboot cause
    # Else the software_reboot_cause will be treated as the reboot cause
    if proc_cmdline_reboot_cause is not None:
        previous_reboot_cause = software_reboot_cause
        # A hardware cause reported during a warm/fast reboot is kept as
        # additional info. None means "non-hardware", so a plain None check
        # is sufficient (and avoids calling a method on None).
        if hardware_reboot_cause is not None:
            additional_reboot_info = hardware_reboot_cause
    elif hardware_reboot_cause is not None:
        previous_reboot_cause = hardware_reboot_cause
        # Check if any software reboot was issued before this hardware reboot happened.
        # Compare by value: the cause was read from disk and is never the
        # same object as the REBOOT_CAUSE_UNKNOWN constant.
        if software_reboot_cause != REBOOT_CAUSE_UNKNOWN:
            additional_reboot_info = software_reboot_cause
    else:
        previous_reboot_cause = software_reboot_cause

    # Current time
    reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))

    # Collect the previous cause info into a dictionary for the JSON file
    reboot_cause_dict = get_reboot_cause_dict(previous_reboot_cause, additional_reboot_info, reboot_cause_gen_time)

    # Write the previous reboot cause to PREVIOUS_REBOOT_CAUSE_FILE_JSON in JSON format
    with open(PREVIOUS_REBOOT_CAUSE_FILE_JSON, "w") as reboot_cause_history_file:
        json.dump(reboot_cause_dict, reboot_cause_history_file)

    # Write the previous reboot cause to PREVIOUS_REBOOT_CAUSE_FILE
    with open(PREVIOUS_REBOOT_CAUSE_FILE, "w") as prev_cause_file:
        prev_cause_file.write(previous_reboot_cause)

    # Remove the old REBOOT_CAUSE_FILE
    if os.path.exists(REBOOT_CAUSE_FILE):
        os.remove(REBOOT_CAUSE_FILE)

    # Write a new default reboot cause file for the next reboot
    with open(REBOOT_CAUSE_FILE, "w") as cause_file:
        cause_file.write(REBOOT_CAUSE_UNKNOWN)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -0,0 +1,7 @@
|
|||||||
|
[Unit]
Description=Reboot cause determination service
After=rc-local.service

[Service]
Type=simple
ExecStart=/usr/bin/determine-reboot-cause

# "systemctl enable" only creates the symlinks described by an [Install]
# section; without one, enabling this unit does not make it start at boot.
[Install]
WantedBy=multi-user.target
|
@ -7,11 +7,12 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import pwd
|
import pwd
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import syslog
|
import syslog
|
||||||
import re
|
|
||||||
|
|
||||||
import sonic_device_util
|
import sonic_device_util
|
||||||
except ImportError as err:
|
except ImportError as err:
|
||||||
@ -22,19 +23,7 @@ VERSION = "1.0"
|
|||||||
SYSLOG_IDENTIFIER = "process-reboot-cause"
|
SYSLOG_IDENTIFIER = "process-reboot-cause"
|
||||||
|
|
||||||
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
|
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
|
||||||
REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "reboot-cause.txt"
|
PREVIOUS_REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "previous-reboot-cause.json"
|
||||||
PREVIOUS_REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "previous-reboot-cause.txt"
|
|
||||||
FIRST_BOOT_PLATFORM_FILE = "/tmp/notify_firstboot_to_platform"
|
|
||||||
REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline"
|
|
||||||
# The following SONIC_BOOT_TYPEs come from the warm/fast reboot script which is in sonic-utilities
|
|
||||||
# Because the system can be rebooted from some old versions, we have to take all possible BOOT options into consideration.
|
|
||||||
# On 201803, 201807 we have
|
|
||||||
# BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') fast-reboot"
|
|
||||||
# On 201811 and later we have
|
|
||||||
# BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" where BOOT_TYPE_ARG can be warm, fastfast or fast
|
|
||||||
# To extract the commom part of them, we should have the following PATTERN
|
|
||||||
REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*"
|
|
||||||
REBOOT_TYPE_KEXEC_PATTERN_FAST = ".*SONIC_BOOT_TYPE=(fast|fast-reboot).*"
|
|
||||||
|
|
||||||
REBOOT_CAUSE_UNKNOWN = "Unknown"
|
REBOOT_CAUSE_UNKNOWN = "Unknown"
|
||||||
|
|
||||||
@ -60,142 +49,27 @@ def log_error(msg):
|
|||||||
|
|
||||||
|
|
||||||
# ============================= Functions =============================
|
# ============================= Functions =============================
|
||||||
def parse_warmfast_reboot_from_proc_cmdline():
|
|
||||||
if os.path.isfile(REBOOT_TYPE_KEXEC_FILE):
|
|
||||||
with open(REBOOT_TYPE_KEXEC_FILE, "r") as cause_file:
|
|
||||||
cause_file_kexec = cause_file.readline()
|
|
||||||
m = re.search(REBOOT_TYPE_KEXEC_PATTERN_WARM, cause_file_kexec)
|
|
||||||
if m and m.group(1):
|
|
||||||
return 'warm-reboot'
|
|
||||||
m = re.search(REBOOT_TYPE_KEXEC_PATTERN_FAST, cause_file_kexec)
|
|
||||||
if m and m.group(1):
|
|
||||||
return 'fast-reboot'
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def find_software_reboot_cause():
|
|
||||||
software_reboot_cause = REBOOT_CAUSE_UNKNOWN
|
|
||||||
|
|
||||||
if os.path.isfile(REBOOT_CAUSE_FILE):
|
|
||||||
with open(REBOOT_CAUSE_FILE, "r") as cause_file:
|
|
||||||
software_reboot_cause = cause_file.readline().rstrip('\n')
|
|
||||||
log_info("{} indicates the reboot cause: {}".format(REBOOT_CAUSE_FILE, software_reboot_cause))
|
|
||||||
else:
|
|
||||||
log_info("Reboot cause file {} not found".format(REBOOT_CAUSE_FILE))
|
|
||||||
|
|
||||||
if os.path.isfile(FIRST_BOOT_PLATFORM_FILE):
|
|
||||||
if software_reboot_cause == REBOOT_CAUSE_UNKNOWN:
|
|
||||||
version_info = sonic_device_util.get_sonic_version_info()
|
|
||||||
build_version = version_info['build_version'] if version_info else "unknown"
|
|
||||||
software_reboot_cause += " (First boot of SONiC version {})".format(build_version)
|
|
||||||
os.remove(FIRST_BOOT_PLATFORM_FILE)
|
|
||||||
|
|
||||||
return software_reboot_cause
|
|
||||||
|
|
||||||
|
|
||||||
def find_proc_cmdline_reboot_cause():
|
|
||||||
proc_cmdline_reboot_cause = parse_warmfast_reboot_from_proc_cmdline()
|
|
||||||
|
|
||||||
if proc_cmdline_reboot_cause:
|
|
||||||
log_info("/proc/cmdline indicates reboot type: {}".format(proc_cmdline_reboot_cause))
|
|
||||||
else:
|
|
||||||
log_info("No reboot cause found from /proc/cmdline")
|
|
||||||
|
|
||||||
return proc_cmdline_reboot_cause
|
|
||||||
|
|
||||||
|
|
||||||
def find_hardware_reboot_cause():
|
|
||||||
hardware_reboot_cause = None
|
|
||||||
|
|
||||||
# Until all platform vendors have provided sonic_platform packages,
|
|
||||||
# if there is no sonic_platform package installed, we only provide
|
|
||||||
# software-related reboot causes.
|
|
||||||
try:
|
|
||||||
import sonic_platform
|
|
||||||
|
|
||||||
platform = sonic_platform.platform.Platform()
|
|
||||||
|
|
||||||
chassis = platform.get_chassis()
|
|
||||||
|
|
||||||
hardware_reboot_cause_major, hardware_reboot_cause_minor = chassis.get_reboot_cause()
|
|
||||||
|
|
||||||
if hardware_reboot_cause_major == chassis.REBOOT_CAUSE_NON_HARDWARE:
|
|
||||||
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will
|
|
||||||
# contain any software-related reboot info. We will use it as the previous cause.
|
|
||||||
pass
|
|
||||||
elif hardware_reboot_cause_major == chassis.REBOOT_CAUSE_HARDWARE_OTHER:
|
|
||||||
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
|
|
||||||
else:
|
|
||||||
hardware_reboot_cause = hardware_reboot_cause_major
|
|
||||||
except ImportError as err:
|
|
||||||
log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
|
|
||||||
|
|
||||||
if hardware_reboot_cause:
|
|
||||||
log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause))
|
|
||||||
else:
|
|
||||||
log_info("No reboot cause found from platform api")
|
|
||||||
|
|
||||||
return hardware_reboot_cause
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
log_info("Starting up...")
|
log_info("Logging reboot-cause...")
|
||||||
|
|
||||||
if not os.geteuid() == 0:
|
if not os.geteuid() == 0:
|
||||||
log_error("User {} does not have permission to execute".format(pwd.getpwuid(os.getuid()).pw_name))
|
log_error("User {} does not have permission to execute".format(pwd.getpwuid(os.getuid()).pw_name))
|
||||||
sys.exit("This utility must be run as root")
|
sys.exit("This utility must be run as root")
|
||||||
|
|
||||||
# Create REBOOT_CAUSE_DIR if it doesn't exist
|
|
||||||
if not os.path.exists(REBOOT_CAUSE_DIR):
|
|
||||||
os.makedirs(REBOOT_CAUSE_DIR)
|
|
||||||
|
|
||||||
# Remove stale PREVIOUS_REBOOT_CAUSE_FILE if it exists
|
|
||||||
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
|
|
||||||
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)
|
|
||||||
|
|
||||||
# Set a default previous reboot cause
|
# Set a default previous reboot cause
|
||||||
previous_reboot_cause = REBOOT_CAUSE_UNKNOWN
|
previous_reboot_cause = REBOOT_CAUSE_UNKNOWN
|
||||||
|
|
||||||
# 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
|
# Read the most recent reboot cause file and log data to syslog
|
||||||
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()
|
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
|
||||||
|
with open(PREVIOUS_REBOOT_CAUSE_FILE, "r") as last_cause_file:
|
||||||
|
data = json.load(last_cause_file)
|
||||||
|
if data['user']:
|
||||||
|
previous_reboot_cause = USER_ISSUED_REBOOT_CAUSE_REGEX.format(data['cause'], data['user'], data['time'])
|
||||||
|
else:
|
||||||
|
previous_reboot_cause = "{}".format(data['cause'])
|
||||||
|
|
||||||
# 2. Check if the previous reboot was caused by hardware
|
# Log the last reboot cause to the syslog
|
||||||
# If yes, the hardware reboot cause will be treated as the reboot cause
|
|
||||||
hardware_reboot_cause = find_hardware_reboot_cause()
|
|
||||||
|
|
||||||
# 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
|
|
||||||
# reboot info. We will use it as the previous cause.
|
|
||||||
software_reboot_cause = find_software_reboot_cause()
|
|
||||||
|
|
||||||
# The main decision logic of the reboot cause:
|
|
||||||
# If there is a reboot cause indicated by /proc/cmdline, it should be warmreboot/fastreboot
|
|
||||||
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
|
|
||||||
# will be treated as the reboot cause
|
|
||||||
# Elif there is a reboot cause indicated by platform API,
|
|
||||||
# the hardware_reboot_cause will be treated as the reboot cause
|
|
||||||
# Else the software_reboot_cause will be treated as the reboot cause
|
|
||||||
if proc_cmdline_reboot_cause is not None:
|
|
||||||
previous_reboot_cause = software_reboot_cause
|
|
||||||
elif hardware_reboot_cause is not None:
|
|
||||||
previous_reboot_cause = hardware_reboot_cause
|
|
||||||
else:
|
|
||||||
previous_reboot_cause = software_reboot_cause
|
|
||||||
|
|
||||||
# Write the previous reboot cause to PREVIOUS_REBOOT_CAUSE_FILE
|
|
||||||
with open(PREVIOUS_REBOOT_CAUSE_FILE, "w") as prev_cause_file:
|
|
||||||
prev_cause_file.write(previous_reboot_cause)
|
|
||||||
|
|
||||||
# Also log the previous reboot cause to the syslog
|
|
||||||
log_info("Previous reboot cause: {}".format(previous_reboot_cause))
|
log_info("Previous reboot cause: {}".format(previous_reboot_cause))
|
||||||
|
|
||||||
# Remove the old REBOOT_CAUSE_FILE
|
|
||||||
if os.path.exists(REBOOT_CAUSE_FILE):
|
|
||||||
os.remove(REBOOT_CAUSE_FILE)
|
|
||||||
|
|
||||||
# Write a new default reboot cause file for the next reboot
|
|
||||||
with open(REBOOT_CAUSE_FILE, "w") as cause_file:
|
|
||||||
cause_file.write(REBOOT_CAUSE_UNKNOWN)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Loading…
Reference in New Issue
Block a user