sonic-buildimage/platform/broadcom/sonic-platform-modules-cel/haliburton/scripts/cpu_wdt
Wirut Getbamrung a416f49676
[platform/cel-haliburton]: add watchdog service (#6259)
Haliburton needs a watchdog daemon to monitor the basic health of the machine. If something goes wrong, such as a crashing program overloading the CPU or the system running out of free memory, the watchdog can safely reboot the machine.
2020-12-26 03:04:21 -08:00

322 lines
9.0 KiB
Python
Executable File

#!/usr/bin/env python
import os
import sys
import time
import signal
import syslog
import argparse
import subprocess
from sonic_platform_base.watchdog_base import WatchdogBase
# Ident string passed to syslog.openlog() for every message of this daemon.
SYSLOG_IDENTIFIER = 'cpu_wdt'

# Run flag of the keep-alive loop; the signal handler clears it on
# SIGINT/SIGTERM to request a clean shutdown.
CPUWDT_MAIN_TASK_RUNNING_FLAG = True

# Directory holding the CPLD register access files, and the names of the
# files used to write ('setreg') and read back ('getreg') a register.
PLATFORM_CPLD_PATH = '/sys/devices/platform/e1031.smc/'
SETREG_FILE = 'setreg'
GETREG_FILE = 'getreg'

# Value returned by Watchdog.arm() when the watchdog could not be armed.
WDT_COMMON_ERROR = -1

# Register read at start-up to obtain the MMC version.
MMC_VERSION_REG = '0x100'

# Watchdog information for CPLD v06.
# MMC versions less than or equal to V06_MMC_VERSION use this register set.
V06_MMC_VERSION = 0x05
V06_WDT_WIDTH = '0x110'
# Supported timeout in seconds -> value written to V06_WDT_WIDTH.
V06_WDT_WIDTH_SELECTOR = {
    30: '0x1',
    60: '0x2',
    180: '0x3',
}
V06_CPLD_WDT_INFO = {
    'wdt_en_reg': '0x111',
    'wdt_en_cmd': '0x0',
    'wdt_dis_cmd': '0x1',
}

# Watchdog information for every newer MMC version.
# The timeout in milliseconds is split over the low/middle/high registers.
WDT_TIMER_L_BIT_REG = '0x117'
WDT_TIMER_M_BIT_REG = '0x118'
WDT_TIMER_H_BIT_REG = '0x119'
# Keep-alive register (the misspelled name is kept: other code refers to it).
WDT_KEEP_ALVIVE_REG = '0x11a'
CPLD_WDT_INFO = {
    'wdt_en_reg': '0x116',
    'wdt_en_cmd': '0x1',
    'wdt_dis_cmd': '0x0',
}

# Defaults used when --timeout / --keep_alive are not given (seconds).
DEFAULT_TIMEOUT = 180
DEFAULT_KEEPALIVE = 60
# ========================== Syslog wrappers ==========================
def _log(priority, msg, also_print_to_console=False):
    """Send one message to syslog, optionally echoing it to stdout.

    The syslog connection is opened with SYSLOG_IDENTIFIER for this message
    and always closed again, even if the syslog call raises.

    @param priority - syslog priority constant (e.g. syslog.LOG_INFO)
    @param msg - text to log
    @param also_print_to_console - when true, also print msg
    """
    syslog.openlog(SYSLOG_IDENTIFIER)
    try:
        syslog.syslog(priority, msg)
    finally:
        syslog.closelog()

    if also_print_to_console:
        print(msg)


def log_info(msg, also_print_to_console=False):
    """Log msg at LOG_INFO priority; see _log for the parameters."""
    _log(syslog.LOG_INFO, msg, also_print_to_console)


def log_warning(msg, also_print_to_console=False):
    """Log msg at LOG_WARNING priority; see _log for the parameters."""
    _log(syslog.LOG_WARNING, msg, also_print_to_console)


def log_error(msg, also_print_to_console=False):
    """Log msg at LOG_ERR priority; see _log for the parameters."""
    _log(syslog.LOG_ERR, msg, also_print_to_console)
class Watchdog(WatchdogBase):
    """CPU watchdog driven through the CPLD 'setreg'/'getreg' files.

    The MMC version read at construction time selects one of two register
    layouts:

    * version <= V06_MMC_VERSION: the timeout is one of the fixed widths in
      V06_WDT_WIDTH_SELECTOR, and the timer is restarted by disabling and
      re-enabling the watchdog.
    * any newer version: the timeout is programmed in milliseconds, split
      over the low/middle/high timer registers, and the timer is restarted
      by writing to the keep-alive register.
    """

    # Largest millisecond value that fits in the three timer byte registers.
    _MAX_TIMER_MS = 0xFFFFFF

    def __init__(self):
        # Init cpld reg path
        self.setreg_path = os.path.join(PLATFORM_CPLD_PATH, SETREG_FILE)
        self.getreg_path = os.path.join(PLATFORM_CPLD_PATH, GETREG_FILE)

        self.mmc_v = self._get_mmc_version()
        if self.mmc_v <= V06_MMC_VERSION:
            self.cpld_info = V06_CPLD_WDT_INFO
        else:
            self.cpld_info = CPLD_WDT_INFO

        # Set default value: start from a disabled watchdog so the software
        # state below matches the hardware.
        self._disable()
        self.armed = False
        self.timeout = 0
        # Time of the last successful arm(); 0 until arm() has succeeded.
        self.arm_timestamp = 0

    def _get_mmc_version(self):
        """Return the MMC version register content as an integer."""
        hex_str_v = self._get_register_value(MMC_VERSION_REG)
        return int(hex_str_v, 16)

    def _get_register_value(self, register):
        """Retrieve the value in the cpld register.

        The register address is first written to the getreg file, then the
        value is read back from that same file.
        """
        self._write_reg(self.getreg_path, register)
        return self._read_reg(self.getreg_path)

    def _write_reg(self, file_path, value):
        """Write str(value) to file_path."""
        with open(file_path, 'w') as fd:
            fd.write(str(value))

    def _read_reg(self, path):
        """Return the first line of path with newline characters stripped."""
        with open(path, 'r') as fd:
            output = fd.readline()
        return output.strip('\n')

    def _seconds_to_lmh_hex(self, seconds):
        """
        Split a timeout into the values of the three timer registers
        @param seconds - timeout in seconds
        @return tuple (low, middle, high) of hex strings such as '0x20'
        @raise ValueError if the timeout in ms is negative or does not fit
               in the three registers
        """
        ms = int(seconds * 1000)  # calculate timeout in ms format
        if not 0 <= ms <= self._MAX_TIMER_MS:
            raise ValueError(
                'timeout of {} seconds is out of range'.format(seconds))
        # Byte-wise split; arithmetic instead of slicing the hex() string so
        # that short and long values are handled uniformly.
        low = hex(ms & 0xFF)
        mid = hex((ms >> 8) & 0xFF)
        high = hex((ms >> 16) & 0xFF)
        return (low, mid, high)

    def _enable(self):
        """
        Turn on the watchdog timer
        """
        enable_val = '{} {}'.format(
            self.cpld_info['wdt_en_reg'], self.cpld_info['wdt_en_cmd'])
        return self._write_reg(self.setreg_path, enable_val)

    def _disable(self):
        """
        Turn off the watchdog timer
        """
        disable_val = '{} {}'.format(
            self.cpld_info['wdt_en_reg'], self.cpld_info['wdt_dis_cmd'])
        return self._write_reg(self.setreg_path, disable_val)

    def _keepalive(self):
        """
        Keep alive watchdog timer
        """
        if self.mmc_v <= V06_MMC_VERSION:
            # No keep-alive register is used on this layout: restart the
            # timer by turning the watchdog off and on again.
            self._disable()
            self._enable()
        else:
            keepalive_val = '{} {}'.format(
                WDT_KEEP_ALVIVE_REG, self.cpld_info['wdt_en_cmd'])
            self._write_reg(self.setreg_path, keepalive_val)

    def _settimeout(self, seconds):
        """
        Set watchdog timer timeout
        @param seconds - timeout in seconds
        @return is the actual set timeout
        @raise ValueError if no supported timeout can satisfy the request
        """
        if seconds <= 0:
            raise ValueError(
                'timeout must be positive, got {}'.format(seconds))

        if self.mmc_v <= V06_MMC_VERSION:
            # Only fixed widths exist: pick the next greater-or-equal one
            # instead of writing an undefined selector to the register.
            candidates = [width for width in sorted(V06_WDT_WIDTH_SELECTOR)
                          if width >= seconds]
            if not candidates:
                raise ValueError(
                    'timeout of {} seconds exceeds the maximum of {}'.format(
                        seconds, max(V06_WDT_WIDTH_SELECTOR)))
            actual = candidates[0]
            set_timeout_val = '{} {}'.format(
                V06_WDT_WIDTH, V06_WDT_WIDTH_SELECTOR[actual])
            self._write_reg(self.setreg_path, set_timeout_val)
            return actual

        (low, mid, high) = self._seconds_to_lmh_hex(seconds)
        set_h_val = '{} {}'.format(WDT_TIMER_H_BIT_REG, high)
        set_m_val = '{} {}'.format(WDT_TIMER_M_BIT_REG, mid)
        set_l_val = '{} {}'.format(WDT_TIMER_L_BIT_REG, low)
        self._write_reg(self.setreg_path, set_h_val)  # set high bit
        self._write_reg(self.setreg_path, set_m_val)  # set med bit
        self._write_reg(self.setreg_path, set_l_val)  # set low bit
        return seconds

    #################################################################

    def arm(self, seconds):
        """
        Arm the hardware watchdog with a timeout of <seconds> seconds.
        If the watchdog is currently armed, calling this function will
        simply reset the timer to the provided value. If the underlying
        hardware does not support the value provided in <seconds>, this
        method should arm the watchdog with the *next greater* available
        value.
        Returns:
            An integer specifying the *actual* number of seconds the watchdog
            was armed with. On failure returns -1.
        """
        ret = WDT_COMMON_ERROR
        try:
            if self.timeout != seconds:
                self.timeout = self._settimeout(seconds)
            if self.armed:
                self._keepalive()
            else:
                self._enable()
                self.armed = True
            ret = self.timeout
            self.arm_timestamp = time.time()
        except (IOError, ValueError) as e:
            # ValueError: unsupported timeout; IOError: register access failed.
            log_error("Error: unable to enable wdt due to : {}".format(e))
        return ret

    def disarm(self):
        """
        Disarm the hardware watchdog
        Returns:
            A boolean, True if watchdog is disarmed successfully, False if not
        """
        disarmed = False
        try:
            self._disable()
            self.armed = False
            disarmed = True
        except IOError as e:
            log_error("Error: unable to disable wdt due to : {}".format(e))
        return disarmed

    def is_armed(self):
        """
        Retrieves the armed state of the hardware watchdog.
        Returns:
            A boolean, True if watchdog is armed, False if not
        """
        # Software state maintained by arm()/disarm(); the hardware is not read.
        return self.armed
# ========================== Signal Handling ==========================
def signal_handler(sig, frame):
    """Handle process signals for the keep-alive loop.

    SIGHUP is logged and ignored. SIGINT and SIGTERM clear
    CPUWDT_MAIN_TASK_RUNNING_FLAG so the main loop stops. Any other signal
    is logged as a warning.

    @param sig - signal number (an integer)
    @param frame - current stack frame, unused
    """
    global CPUWDT_MAIN_TASK_RUNNING_FLAG

    if sig == signal.SIGHUP:
        log_info("Caught SIGHUP - ignoring...")
    elif sig == signal.SIGINT:
        log_info("Caught SIGINT - exiting...")
        CPUWDT_MAIN_TASK_RUNNING_FLAG = False
    elif sig == signal.SIGTERM:
        log_info("Caught SIGTERM - exiting...")
        CPUWDT_MAIN_TASK_RUNNING_FLAG = False
    else:
        # sig is an int: format it rather than concatenating it to a str,
        # which would raise TypeError inside the handler.
        log_warning("Caught unhandled signal '{}'".format(sig))
#
# Main =========================================================================
#
def check_cpld_driver():
    """Wait for the CPLD 'setreg' file to appear, or exit the process.

    The file is checked up to 60 times, one second apart. The function
    returns as soon as the file exists; if it never shows up, an error is
    printed and the process exits with status 1.
    """
    setreg_path = os.path.join(PLATFORM_CPLD_PATH, SETREG_FILE)

    for _attempt in range(60):
        if os.path.isfile(setreg_path):
            return
        time.sleep(1)

    print("Error: The cpld driver has not been loaded.")
    sys.exit(1)
def _sleep_while_running(seconds):
    """Sleep for <seconds> whole seconds, returning early on shutdown.

    Sleeping in one-second slices lets the loop notice that the signal
    handler cleared CPUWDT_MAIN_TASK_RUNNING_FLAG without waiting for the
    whole keep-alive interval to elapse.
    """
    for _ in range(seconds):
        if not CPUWDT_MAIN_TASK_RUNNING_FLAG:
            return
        time.sleep(1)


def main():
    """Start or stop the CPU watchdog according to the command line.

    'start' arms the watchdog and keeps re-arming it until SIGINT/SIGTERM is
    received, then disables it. 'stop' only disables it. The process exits
    with status 1 if the watchdog cannot be disabled.
    """
    # Register our signal handlers
    signal.signal(signal.SIGHUP, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Init argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument("action", help="Start/Stop CPU WDT",
                        choices=['start', 'stop'])
    parser.add_argument(
        "--timeout", "-t", help="WDT timeout period", choices=[30, 60, 180], type=int)
    parser.add_argument("--keep_alive", "-k",
                        help="WDT keep alive period", type=int)
    args = parser.parse_args()

    timeout = args.timeout or DEFAULT_TIMEOUT
    if args.keep_alive:
        keep_alive = args.keep_alive
        # A keep-alive period longer than the timeout lets the watchdog
        # expire before the first kick; a negative one breaks time.sleep().
        if not 0 < keep_alive <= timeout:
            parser.error(
                '--keep_alive must be between 1 and the timeout '
                '({} seconds)'.format(timeout))
    else:
        # The default period must not outlast a shorter, user-chosen timeout.
        keep_alive = min(DEFAULT_KEEPALIVE, max(timeout // 2, 1))

    # Check the cpld driver
    check_cpld_driver()

    # Init WDT Class
    watchdog = Watchdog()

    if args.action == 'start':
        log_info('Enable CPU WDT..')

        # Enable
        armed_timeout = watchdog.arm(timeout)
        if armed_timeout == WDT_COMMON_ERROR:
            # arm() already logged the cause; the loop below tries again.
            log_warning('CPU WDT could not be enabled, retrying on next keep alive')
        else:
            log_info('CPU WDT has been enabled with {} seconds timeout'.format(armed_timeout))

        # Keep Alive
        log_info('Enable keep alive messaging every {} seconds'.format(keep_alive))
        # Kick one second ahead of the period, but never spin without sleeping.
        interval = max(keep_alive - 1, 1)
        while CPUWDT_MAIN_TASK_RUNNING_FLAG:
            _sleep_while_running(interval)
            if not CPUWDT_MAIN_TASK_RUNNING_FLAG:
                break
            watchdog.arm(timeout)
        log_info('Keep alive messaging has been disabled')

    # Disable: reached for 'stop', and for 'start' once the loop has ended.
    log_info('Disable CPU WDT..')
    if not watchdog.disarm():
        # disarm() already logged the cause; report the failure to the caller.
        sys.exit(1)
    log_info('CPU WDT has been disabled!')


if __name__ == '__main__':
    main()