From bb6ff62a3235763f4449e05f26f5344878c48daa Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Fri, 25 May 2018 10:52:35 -0700 Subject: [PATCH] [service] introducing serial port watchdog service (#1743) * [rc.local] refactor platform identification code to separate function Signed-off-by: Ying Xie * [rc.local] infrastructure to take action according to installer.conf * [serial port watchdog] add service to watch serial port processes Monitor serial port processes. Kill ones stuck for too long. Signed-off-by: Ying Xie * [rc.local] start watchdog on serial port specified by installer.conf Signed-off-by: Ying Xie --- .../build_templates/sonic_debian_extension.j2 | 5 + files/image_config/platform/rc.local | 44 ++- .../serial-port-watchdog.py | 328 ++++++++++++++++++ .../serial-port-watchdog.service | 12 + 4 files changed, 385 insertions(+), 4 deletions(-) create mode 100755 files/image_config/serial-port-watchdog/serial-port-watchdog.py create mode 100644 files/image_config/serial-port-watchdog/serial-port-watchdog.service diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index d0afa46596..3d46f09aa6 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -154,6 +154,11 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/ +# Copy serial-port-watchdog configuration scripts +sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/ +sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service +sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/ + # Copy updategraph script and service file sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/ sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service diff --git a/files/image_config/platform/rc.local b/files/image_config/platform/rc.local index de54d141ef..ab88b7a3f5 100755 --- a/files/image_config/platform/rc.local +++ b/files/image_config/platform/rc.local @@ -183,15 +183,51 @@ for x in "$@"; do done } -eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ") - -if [ -f /host/image-$sonic_version/platform/firsttime ]; then - +setup_platform() +{ if [ -n "$aboot_platform" ]; then platform=$aboot_platform elif [ -n "$onie_platform" ]; then platform=$onie_platform else + platform='' + fi +} + +# Setup default values in this function before reading installer.conf +# installer.conf could override the value set in this function. +setup_platform_defaults() +{ + # Default serial port: ttyS0 + CONSOLE_DEV=0 +} + +load_platform_installer_config() +{ + INSTALLER_CFG=/usr/share/sonic/device/$platform/installer.conf + if [ -f $INSTALLER_CFG ]; then + . $INSTALLER_CFG + fi +} + +program_serial_port() +{ + sed -i "s|ttyS.|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service + systemctl daemon-reload + systemctl restart serial-port-watchdog.service +} + +eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ") + +setup_platform +setup_platform_defaults +load_platform_installer_config + +program_serial_port + +if [ -f /host/image-$sonic_version/platform/firsttime ]; then + + if [ -z "$platform" ]; then echo "Unknown sonic platform" firsttime_exit fi diff --git a/files/image_config/serial-port-watchdog/serial-port-watchdog.py b/files/image_config/serial-port-watchdog/serial-port-watchdog.py new file mode 100755 index 0000000000..15c57556d0 --- /dev/null +++ b/files/image_config/serial-port-watchdog/serial-port-watchdog.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python + +from __future__ import print_function, with_statement + +import argparse +import logging +import logging.handlers +import os +import time +import signal +import socket +import sys + +from collections import namedtuple + +PRGNAME = 'serial-port-watchdog' + +DEVFS_PATH = '/dev' +PROCFS_PATH = '/proc' + +# According to procfs(5) +ProcStat = namedtuple( 'ProcStat', [ + 'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid', + 'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime', + 'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue', + 'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode', + 'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore', + 'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor', + 'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time', + 'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start', + 'arg_end', 'env_start', 'env_end', 'exit_code' +] ) + +# According to procfs(5) +ProcIo = namedtuple( 'ProcIo', [ + 'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes', + 'cancelled_write_bytes' +] ) + +class Process( object ): + def __init__( self, pid, path=PROCFS_PATH ): + self.pid = pid + self.path = os.path.join( path, str( pid ) ) + self.childs = [] + self.parent = None + + self.stat = None + + self.io = None + self.stack = None + self.stackStartTime = None + + def refresh( self ): + with open( os.path.join( self.path, 'stat' ) ) as f: + data = f.read().rstrip().split() + self.stat = ProcStat( *data ) + + def getStat( self, key=None ): + self.refresh() + return self.stat + + def uid( self ): + return '%s/%s' % ( self.pid, self.stat.starttime ) + + def ppid( self ): + return self.stat.ppid + + def name( self ): + with open( os.path.join( self.path, 'comm' ) ) as f: + return f.read().rstrip() + + def getTtyForFd( self, fd ): + path = os.path.join( self.path, 'fd', str( fd ) ) + if not os.path.exists( path ): + return '' + return os.readlink( path ) + + def getStack( self ): + with open( os.path.join( self.path, 'stack' ) ) as f: + return f.read() + + def getIo( self ): + with open( os.path.join( self.path, 'io' ) ) as f: + data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ] + return ProcIo( *data ) + + def isUsingTty( self, tty ): + return self.getTtyForFd( 0 ).endswith( tty ) + + def checkStuck( self, content ): + stack = self.getStack() + + found = False + for match in content: + if match in stack: + found = True + break + + if not found: + self.io = None + self.stack = None + self.stackStartTime = None + return 0 + + io = self.getIo() + + if self.stack != stack or self.io != io: + self.io = io + self.stack = stack + self.stackStartTime = time.time() + return 0 + + return time.time() - self.stackStartTime + + def __repr__( self ): + return '' % self.uid() + +class ProcessMonitor( object ): + def __init__( self, path=PROCFS_PATH ): + self.path = path + self.procs = {} + self.filters = [] + self.checkers = [] + self.whitelist = [] + + def addProcessFilter( self, func, *args ): + self.filters.append( ( func, args ) ) + + def addStuckChecker( self, func, *args ): + self.checkers.append( ( func, args ) ) + + def setWhitelist( self, whitelist ): + self.whitelist = whitelist + + def shouldHandleProcess( self, proc ): + matched = False + for func, args in self.filters: + if func( proc, *args ): + matched = True + break + + if not matched: + return False + + name = proc.name() + for item in self.whitelist: + if item in name: + return False + + return True + + def getRunningPids( self ): + pids = [] + for entry in os.listdir( self.path ): + if not entry.isdigit(): + continue + pids.append( int( entry ) ) + return pids + + def killStuckProcess( self, proc, elapsed, kill, timeout ): + if not elapsed: + return + + if elapsed < timeout: + if elapsed > timeout / 2: + logging.info( 'process %d seems stuck, idle for %ds, waiting ' + 'some more time', proc.pid, elapsed ) + return + + logging.warning( 'process %d has been stuck for %d seconds, killing...', + proc.pid, elapsed ) + logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack ) + if kill: + # XXX: SIGTERM sleep then if alive SIGKILL ? + os.kill( proc.pid, signal.SIGKILL ) + + def killStuckProcesses( self, kill, timeout ): + for proc in self.procs.values(): + for checker, args in self.checkers: + elapsed = checker( proc, *args ) + self.killStuckProcess( proc, elapsed, kill, timeout ) + + def updatePid( self, pid ): + p = Process( pid ) + + # if the process is already monitored (previously running) + r = self.procs.get( pid, None ) + if r: + p.refresh() + # if the process is still running + if p.uid() == r.uid(): + logging.debug( 'process %d still running', pid ) + return + # or the pid was reused but the process is different + logging.debug( 'pid %d reused for another process', pid ) + del self.procs[ pid ] + + # check if the process is relevant for monitoring + if not self.shouldHandleProcess( p ): + return + + logging.debug( 'watching process %d', pid ) + p.refresh() + self.procs[ pid ] = p + + def updateParenting( self ): + # clear parent and childs for monitored processes + for proc in self.procs.values(): + del proc.childs[:] + proc.parent = None + + # set parent and childs for monitored processes + for proc in self.procs.values(): + ppid = proc.ppid() + parent = self.procs.get( ppid, None ) + if parent: + proc.parent = parent + parent.childs.append( proc ) + + def update( self ): + pids = self.getRunningPids() + + # remove defunct processes + for pid in list(self.procs.keys()): + if pid not in pids: + logging.debug( 'process %d is defunct', pid ) + del self.procs[ pid ] + + # create or update running processes information + for pid in pids: + try: + self.updatePid( pid ) + except: + logging.warning( 'An issue occured whileupdating process %s', + pid ) + raise + + #self.updateParenting() + +def checkRootPermissions(): + if os.geteuid() != 0: + logging.error( 'You must be root to use this feature' ) + sys.exit( 1 ) + +def getHostname(): + try: + return socket.gethostname() + except: + return 'localhost' + +def setupLogging( verbose=False ): + loglevel = logging.DEBUG if verbose else logging.INFO + dateFmt = '%Y-%m-%d %H:%M:%S' + + log = logging.getLogger() + log.setLevel( logging.DEBUG ) + + logOut = logging.StreamHandler( sys.stdout ) + logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) ) + logOut.setLevel( loglevel ) + log.addHandler( logOut ) + + logSys = logging.handlers.SysLogHandler() + # format to rfc5424 format + fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME ) + logSys.setFormatter( logging.Formatter( fmt ) ) + logSys.setLevel( logging.WARNING ) + log.addHandler( logSys ) + try: + # the connection to the syslog socket happens with the first message + log.info( 'Attaching to syslog' ) + except: + log.warning( 'Failed open syslog' ) + +def listParser( value ): + if not value.strip(): + return [] + return value.split( ',' ) + +def ttyParser( dev, path=DEVFS_PATH ): + if not dev.startswith( DEVFS_PATH ): + dev = os.path.join( DEVFS_PATH, dev ) + if not os.path.exists( dev ): + raise argparse.ArgumentTypeError( '%s is not a device' % dev ) + return dev + +def parseArgs( args ): + parser = argparse.ArgumentParser() + + parser.add_argument( '-d', '--dry-run', action='store_true', + help='only print processes that would be killed' ) + parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser, + help='functions to look for in the stack trace' ) + parser.add_argument( '-i', '--interval', default=60, type=float, + help='interval at which to check the procfs' ) + parser.add_argument( '-k', '--timeout', default=3600, type=float, + help='timeout for which a process gets killed' ) + parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser, + help='tty to check for stuck process' ) + parser.add_argument( '-v', '--verbose', action='store_true', + help='print all debug messages' ) + parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser, + help='whitelist programs that should never be killed' ) + + return parser.parse_args( args ) + +def main( args ): + args = parseArgs( args ) + + setupLogging( args.verbose ) + checkRootPermissions() + + m = ProcessMonitor() + m.addProcessFilter( Process.isUsingTty, args.tty ) + m.addStuckChecker( Process.checkStuck, args.funcs ) + m.setWhitelist( args.whitelist ) + + while True: + logging.debug( 'updating processes' ) + m.update() + m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout ) + time.sleep( args.interval ) + + return 0 + +if __name__ == '__main__': + sys.exit( main( sys.argv[ 1: ] ) ) + diff --git a/files/image_config/serial-port-watchdog/serial-port-watchdog.service b/files/image_config/serial-port-watchdog/serial-port-watchdog.service new file mode 100644 index 0000000000..b86580ec27 --- /dev/null +++ b/files/image_config/serial-port-watchdog/serial-port-watchdog.service @@ -0,0 +1,12 @@ +[Unit] +Description=Monitor serial port processes, kill stuck ones +Requires= +After=rc.local.Service + +[Service] +ExecStart=/usr/bin/serial-port-watchdog.py -t ttyS0 +Restart=always +RestartSec=0 + +[Install] +WantedBy=multi-user.target