[platform/cel-haliburton]: add watchdog service (#6259)
Haliburton needs a watchdog daemon to monitor the basic health of the machine. If something goes wrong, such as a crashing program overloading the CPU or the system running out of free memory, the watchdog can safely reboot the machine.
This commit is contained in:
parent
d609b406be
commit
a416f49676
@ -1,2 +1,4 @@
|
|||||||
haliburton/cfg/haliburton-modules.conf etc/modules-load.d
|
haliburton/cfg/haliburton-modules.conf etc/modules-load.d
|
||||||
haliburton/systemd/platform-modules-haliburton.service lib/systemd/system
|
haliburton/systemd/platform-modules-haliburton.service lib/systemd/system
|
||||||
|
haliburton/systemd/cpu_wdt.service lib/systemd/system
|
||||||
|
haliburton/scripts/cpu_wdt /usr/local/bin/
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
depmod -a
|
depmod -a
|
||||||
systemctl enable platform-modules-haliburton.service
|
systemctl enable platform-modules-haliburton.service
|
||||||
systemctl start platform-modules-haliburton.service
|
systemctl start platform-modules-haliburton.service
|
||||||
|
|
||||||
|
systemctl enable cpu_wdt.service
|
||||||
|
systemctl start cpu_wdt.service
|
||||||
|
321
platform/broadcom/sonic-platform-modules-cel/haliburton/scripts/cpu_wdt
Executable file
321
platform/broadcom/sonic-platform-modules-cel/haliburton/scripts/cpu_wdt
Executable file
@ -0,0 +1,321 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import signal
|
||||||
|
import syslog
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
from sonic_platform_base.watchdog_base import WatchdogBase
|
||||||
|
|
||||||
|
# Identifier passed to syslog.openlog() by the logging wrappers.
SYSLOG_IDENTIFIER = 'cpu_wdt'
# Set to False by the signal handler to stop the keep-alive loop in main().
CPUWDT_MAIN_TASK_RUNNING_FLAG = True

# sysfs directory holding the CPLD register access files.
PLATFORM_CPLD_PATH = '/sys/devices/platform/e1031.smc/'
SETREG_FILE, GETREG_FILE = 'setreg', 'getreg'
# Value returned by Watchdog.arm() on failure.
WDT_COMMON_ERROR = -1
# Register read to obtain the MMC version.
MMC_VERSION_REG = '0x100'

# Watchdog information for CPLD v06: this layout is used whenever the
# version read from MMC_VERSION_REG is <= V06_MMC_VERSION.
V06_MMC_VERSION = 5
V06_WDT_WIDTH = '0x110'
# Supported timeouts in seconds -> value written to V06_WDT_WIDTH.
V06_WDT_WIDTH_SELECTOR = {30: '0x1', 60: '0x2', 180: '0x3'}

V06_CPLD_WDT_INFO = dict(
    wdt_en_reg='0x111',
    wdt_en_cmd='0x0',
    wdt_dis_cmd='0x1',
)

# Watchdog information for every other CPLD version: the timeout in
# milliseconds is written across the low/mid/high timer registers.
WDT_TIMER_L_BIT_REG = '0x117'
WDT_TIMER_M_BIT_REG = '0x118'
WDT_TIMER_H_BIT_REG = '0x119'
WDT_KEEP_ALVIVE_REG = '0x11a'

CPLD_WDT_INFO = dict(
    wdt_en_reg='0x116',
    wdt_en_cmd='0x1',
    wdt_dis_cmd='0x0',
)


# Defaults (in seconds) used when the command line does not override them.
DEFAULT_TIMEOUT = 180
DEFAULT_KEEPALIVE = 60
|
||||||
|
|
||||||
|
|
||||||
|
# ========================== Syslog wrappers ==========================
|
||||||
|
|
||||||
|
|
||||||
|
def log_info(msg, also_print_to_console=False):
    """Write msg to syslog at LOG_INFO priority.

    The syslog connection is opened under SYSLOG_IDENTIFIER and closed
    again for every message. When also_print_to_console is true the
    message is additionally printed to stdout.
    """
    priority = syslog.LOG_INFO

    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(priority, msg)
    syslog.closelog()

    if not also_print_to_console:
        return
    print(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def log_warning(msg, also_print_to_console=False):
    """Write msg to syslog at LOG_WARNING priority.

    The syslog connection is opened under SYSLOG_IDENTIFIER and closed
    again for every message. When also_print_to_console is true the
    message is additionally printed to stdout.
    """
    priority = syslog.LOG_WARNING

    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(priority, msg)
    syslog.closelog()

    if not also_print_to_console:
        return
    print(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def log_error(msg, also_print_to_console=False):
    """Write msg to syslog at LOG_ERR priority.

    The syslog connection is opened under SYSLOG_IDENTIFIER and closed
    again for every message. When also_print_to_console is true the
    message is additionally printed to stdout.
    """
    priority = syslog.LOG_ERR

    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(priority, msg)
    syslog.closelog()

    if not also_print_to_console:
        return
    print(msg)
|
||||||
|
|
||||||
|
|
||||||
|
class Watchdog(WatchdogBase):
    """CPLD-backed hardware watchdog.

    The CPLD is driven through the ``setreg``/``getreg`` files under
    PLATFORM_CPLD_PATH. Two register layouts are handled, selected by the
    version read from MMC_VERSION_REG:

    * version <= V06_MMC_VERSION: the timeout must be one of the widths in
      V06_WDT_WIDTH_SELECTOR, and the timer is restarted by disabling and
      re-enabling the watchdog;
    * any other version: the timeout is written in milliseconds across the
      low/mid/high timer registers, and the timer is restarted by writing
      to WDT_KEEP_ALVIVE_REG.
    """

    # Largest millisecond count that fits in the three timer registers when
    # one byte is written to each of them.
    _MAX_TIMEOUT_MS = 0xFFFFFF

    def __init__(self):
        """Locate the register files, detect the CPLD layout and disable the timer.

        Raises:
            IOError: if a CPLD register file cannot be read or written.
            ValueError: if the version register does not contain a hex number.
        """
        # Init cpld reg path
        self.setreg_path = os.path.join(PLATFORM_CPLD_PATH, SETREG_FILE)
        self.getreg_path = os.path.join(PLATFORM_CPLD_PATH, GETREG_FILE)

        self.mmc_v = self._get_mmc_version()
        if self.mmc_v <= V06_MMC_VERSION:
            self.cpld_info = V06_CPLD_WDT_INFO
        else:
            self.cpld_info = CPLD_WDT_INFO

        # Start from a known state: watchdog off, no timeout programmed.
        self._disable()
        self.armed = False
        self.timeout = 0

    def _get_mmc_version(self):
        """Return the MMC version register content as an integer."""
        hex_str_v = self._get_register_value(MMC_VERSION_REG)
        return int(hex_str_v, 16)

    def _get_register_value(self, register):
        """Return the content of a CPLD register as the string the driver reports.

        The register address is first written to the getreg file, then the
        value is read back from that same file.
        """
        self._write_reg(self.getreg_path, register)
        return self._read_reg(self.getreg_path)

    def _write_reg(self, file_path, value):
        """Write str(value) to file_path."""
        with open(file_path, 'w') as fd:
            fd.write(str(value))

    def _read_reg(self, path):
        """Return the first line of path with newline characters stripped."""
        with open(path, 'r') as fd:
            output = fd.readline()
        return output.strip('\n')

    def _get_level_hex(self, sub_hex):
        """Normalise a fragment of hex() output (e.g. '0x', 'x3', '30') to a hex string.

        Kept for compatibility; _seconds_to_lmh_hex no longer relies on it.
        """
        sub_hex_str = sub_hex.replace("x", "0")
        return hex(int(sub_hex_str, 16))

    def _seconds_to_lmh_hex(self, seconds):
        """Split a timeout into the (low, mid, high) register bytes.

        @param seconds - timeout in seconds
        @return tuple of three hex strings, least significant byte first
        @raise ValueError if the timeout in milliseconds is negative or does
               not fit in the three timer registers
        """
        ms = int(seconds * 1000)  # calculate timeout in ms format
        if not 0 <= ms <= self._MAX_TIMEOUT_MS:
            raise ValueError(
                "timeout of {} seconds does not fit the timer registers".format(seconds))

        # Plain byte arithmetic instead of slicing the hex() text, so short
        # values such as 0 are handled without special cases.
        low = hex(ms & 0xFF)
        mid = hex((ms >> 8) & 0xFF)
        high = hex((ms >> 16) & 0xFF)
        return (low, mid, high)

    def _enable(self):
        """
        Turn on the watchdog timer
        """
        enable_val = '{} {}'.format(
            self.cpld_info['wdt_en_reg'], self.cpld_info['wdt_en_cmd'])
        return self._write_reg(self.setreg_path, enable_val)

    def _disable(self):
        """
        Turn off the watchdog timer
        """
        disable_val = '{} {}'.format(
            self.cpld_info['wdt_en_reg'], self.cpld_info['wdt_dis_cmd'])
        return self._write_reg(self.setreg_path, disable_val)

    def _keepalive(self):
        """
        Keep alive watchdog timer
        """
        if self.mmc_v <= V06_MMC_VERSION:
            # This layout is restarted by toggling the enable register.
            self._disable()
            self._enable()
        else:
            enable_val = '{} {}'.format(
                WDT_KEEP_ALVIVE_REG, self.cpld_info['wdt_en_cmd'])
            self._write_reg(self.setreg_path, enable_val)

    def _settimeout(self, seconds):
        """
        Set watchdog timer timeout
        @param seconds - timeout in seconds
        @return is the actual set timeout
        @raise ValueError if the timeout is not positive or exceeds what the
               hardware can be programmed with
        """
        if seconds <= 0:
            raise ValueError(
                "timeout must be a positive number of seconds, got {}".format(seconds))

        if self.mmc_v <= V06_MMC_VERSION:
            # Only fixed widths exist here: round up to the next supported
            # one instead of writing an invalid selector to the register.
            candidates = [
                width for width in sorted(V06_WDT_WIDTH_SELECTOR) if width >= seconds]
            if not candidates:
                raise ValueError(
                    "timeout of {} seconds exceeds the maximum of {} seconds".format(
                        seconds, max(V06_WDT_WIDTH_SELECTOR)))
            actual = candidates[0]
            set_timeout_val = '{} {}'.format(
                V06_WDT_WIDTH, V06_WDT_WIDTH_SELECTOR[actual])
            self._write_reg(self.setreg_path, set_timeout_val)
            return actual

        (low, mid, high) = self._seconds_to_lmh_hex(seconds)
        set_h_val = '{} {}'.format(WDT_TIMER_H_BIT_REG, high)
        set_m_val = '{} {}'.format(WDT_TIMER_M_BIT_REG, mid)
        set_l_val = '{} {}'.format(WDT_TIMER_L_BIT_REG, low)
        self._write_reg(self.setreg_path, set_h_val)  # set high bit
        self._write_reg(self.setreg_path, set_m_val)  # set med bit
        self._write_reg(self.setreg_path, set_l_val)  # set low bit
        return seconds

    #################################################################

    def arm(self, seconds):
        """
        Arm the hardware watchdog with a timeout of <seconds> seconds.
        If the watchdog is currently armed, calling this function will
        simply reset the timer to the provided value. If the underlying
        hardware does not support the value provided in <seconds>, this
        method should arm the watchdog with the *next greater* available
        value.
        Returns:
            An integer specifying the *actual* number of seconds the watchdog
            was armed with. On failure returns -1.
        """
        ret = WDT_COMMON_ERROR

        try:
            if self.timeout != seconds:
                self.timeout = self._settimeout(seconds)

            if self.armed:
                self._keepalive()
            else:
                self._enable()
                self.armed = True

            ret = self.timeout
            self.arm_timestamp = time.time()
        except (IOError, ValueError) as e:
            # ValueError covers timeouts the hardware cannot be programmed
            # with; both cases are reported through the -1 return value.
            log_error("Error: unable to enable wdt due to : {}".format(e))

        return ret

    def disarm(self):
        """
        Disarm the hardware watchdog
        Returns:
            A boolean, True if watchdog is disarmed successfully, False if not
        """
        disarmed = False
        try:
            self._disable()
            self.armed = False
            disarmed = True
        except IOError as e:
            log_error("Error: unable to disable wdt due to : {}".format(e))
        return disarmed

    def is_armed(self):
        """
        Retrieves the armed state of the hardware watchdog.
        Returns:
            A boolean, True if watchdog is armed, False if not
        """
        return self.armed
|
||||||
|
|
||||||
|
# ========================== Signal Handling ==========================
|
||||||
|
|
||||||
|
|
||||||
|
def signal_handler(sig, frame):
    """Handle the signals registered by main().

    SIGHUP is logged and ignored. SIGINT and SIGTERM clear
    CPUWDT_MAIN_TASK_RUNNING_FLAG so the keep-alive loop ends. Any other
    signal is only logged as a warning.

    Args:
        sig: signal number delivered to the process.
        frame: current stack frame (unused).
    """
    global CPUWDT_MAIN_TASK_RUNNING_FLAG
    if sig == signal.SIGHUP:
        log_info("Caught SIGHUP - ignoring...")
    elif sig == signal.SIGINT:
        log_info("Caught SIGINT - exiting...")
        CPUWDT_MAIN_TASK_RUNNING_FLAG = False
    elif sig == signal.SIGTERM:
        log_info("Caught SIGTERM - exiting...")
        CPUWDT_MAIN_TASK_RUNNING_FLAG = False
    else:
        # sig is an integer: format it rather than concatenating, which
        # would raise TypeError inside the handler.
        log_warning("Caught unhandled signal '{}'".format(sig))
|
||||||
|
|
||||||
|
#
|
||||||
|
# Main =========================================================================
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
def check_cpld_driver():
    """Wait for the CPLD driver's setreg file to appear.

    The file is polled once per second, at most 60 times. If it never
    shows up an error is printed and the process exits with status 1.
    """
    setreg_file = os.path.join(PLATFORM_CPLD_PATH, SETREG_FILE)

    for _ in range(60):
        if os.path.isfile(setreg_file):
            return
        time.sleep(1)

    print("Error: The cpld driver has not been loaded.")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_keep_alive(timeout, requested, default):
    """Pick the keep-alive period (seconds) to use with the given timeout.

    The keep-alive loop sleeps ``keep_alive - 1`` seconds (at least one)
    between kicks; that interval must be shorter than the watchdog timeout,
    otherwise the timer expires before the first kick.

    Args:
        timeout: watchdog timeout in seconds.
        requested: value given on the command line, or None/0 when absent.
        default: period to use when nothing was requested.

    Returns:
        The keep-alive period in seconds.

    Raises:
        ValueError: if an explicitly requested period is negative or too
            long for the timeout.
    """
    if requested:
        if requested < 0:
            raise ValueError("keep alive period must be positive")
        if max(requested - 1, 1) >= timeout:
            raise ValueError(
                "keep alive period of {} seconds is too long for a {} seconds "
                "timeout".format(requested, timeout))
        return requested

    if max(default - 1, 1) >= timeout:
        # The default does not fit this timeout: kick twice per period.
        return max(timeout // 2, 1)
    return default


def _sleep_while_running(seconds):
    """Sleep up to *seconds* seconds, returning early once a stop was requested.

    Sleeping in one-second slices lets the loop notice that the signal
    handler cleared CPUWDT_MAIN_TASK_RUNNING_FLAG without waiting for the
    whole interval to elapse.
    """
    for _ in range(seconds):
        if not CPUWDT_MAIN_TASK_RUNNING_FLAG:
            return
        time.sleep(1)


def main():
    """Entry point: ``cpu_wdt start|stop [--timeout N] [--keep_alive N]``.

    ``start`` arms the watchdog and keeps kicking it until SIGINT or
    SIGTERM is received, then disarms it. ``stop`` only disarms it. The
    process exits with a non-zero status if the watchdog cannot be armed
    or disarmed.
    """
    # Register our signal handlers
    signal.signal(signal.SIGHUP, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Init argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument("action", help="Start/Stop CPU WDT",
                        choices=['start', 'stop'])
    parser.add_argument(
        "--timeout", "-t", help="WDT timeout period", choices=[30, 60, 180], type=int)
    parser.add_argument("--keep_alive", "-k",
                        help="WDT keep alive period", type=int)
    args = parser.parse_args()

    starting = args.action == 'start'
    timeout = args.timeout or DEFAULT_TIMEOUT
    keep_alive = None
    if starting:
        # Reject unusable periods before touching the hardware.
        try:
            keep_alive = _resolve_keep_alive(
                timeout, args.keep_alive, DEFAULT_KEEPALIVE)
        except ValueError as e:
            parser.error(str(e))

    # Check the cpld driver
    check_cpld_driver()

    # Init WDT Class
    watchdog = Watchdog()

    if starting:
        log_info('Enable CPU WDT..')

        # Enable
        armed_timeout = watchdog.arm(timeout)
        if armed_timeout == WDT_COMMON_ERROR:
            log_error('Error: CPU WDT could not be enabled', True)
            sys.exit(1)
        log_info('CPU WDT has been enabled with {} seconds timeout'.format(armed_timeout))

        # Keep Alive
        log_info('Enable keep alive messaging every {} seconds'.format(keep_alive))
        interval = max(keep_alive - 1, 1)  # never spin without sleeping
        while CPUWDT_MAIN_TASK_RUNNING_FLAG:
            _sleep_while_running(interval)
            if CPUWDT_MAIN_TASK_RUNNING_FLAG:
                watchdog.arm(timeout)
        log_info('Keep alive messaging has been disabled')

    # Disable
    log_info('Disable CPU WDT..')
    if not watchdog.disarm():
        # disarm() has already logged the underlying error.
        sys.exit(1)
    log_info('CPU WDT has been disabled!')


if __name__ == '__main__':
    main()
|
@ -0,0 +1,10 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=CPU WDT
|
||||||
|
After=platform-modules-haliburton.service
|
||||||
|
Requires=platform-modules-haliburton.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStart=-/usr/local/bin/cpu_wdt start
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
Reference in New Issue
Block a user