[service] introducing serial port watchdog service (#1743)

* [rc.local] refactor platform identification code to separate function

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [rc.local] infrastructure to take action according to installer.conf

* [serial port watchdog] add service to watch serial port processes

Monitor serial port processes. Kill ones stuck for too long.

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [rc.local] start watchdog on serial port specified by installer.conf

Signed-off-by: Ying Xie <ying.xie@microsoft.com>
This commit is contained in:
Ying Xie 2018-05-25 10:52:35 -07:00 committed by GitHub
parent d165a5030a
commit bb6ff62a32
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 385 additions and 4 deletions

View File

@ -154,6 +154,11 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/
# Install the serial-port-watchdog into the image: copy its systemd unit into
# the target root filesystem, enable the unit from inside the chroot, then
# copy the watchdog script itself to /usr/bin of the target.
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/
# Copy updategraph script and service file
sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service

View File

@ -183,15 +183,51 @@ for x in "$@"; do
done
}
eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")
if [ -f /host/image-$sonic_version/platform/firsttime ]; then
# Pick the platform name: $aboot_platform when it is non-empty, otherwise
# $onie_platform, otherwise the empty string. The result is left in $platform.
setup_platform()
{
    platform=${aboot_platform:-$onie_platform}
}
# Set the default values in this function before installer.conf is read;
# installer.conf may override any value set here.
setup_platform_defaults()
{
# Default serial port: ttyS0. CONSOLE_DEV holds only the numeric suffix; it is
# appended to "ttyS" when the watchdog unit file is rewritten.
CONSOLE_DEV=0
}
# Source the platform-specific installer.conf when it exists, so that it can
# override previously assigned settings.
#   Reads: $platform
#   Sets:  INSTALLER_CFG (the path that was probed) plus whatever the sourced
#          file defines.
load_platform_installer_config()
{
    INSTALLER_CFG="/usr/share/sonic/device/$platform/installer.conf"
    # Keep the expansions quoted: unquoted, a platform name containing
    # whitespace or glob characters would be word-split and break the test
    # (or source the wrong file).
    # An explicit "if" (rather than "[ ... ] && .") keeps the function's exit
    # status 0 when the file is absent.
    if [ -f "$INSTALLER_CFG" ]; then
        . "$INSTALLER_CFG"
    fi
}
# Point the serial-port-watchdog unit at ttyS$CONSOLE_DEV, then reload systemd
# and restart the service so the new tty takes effect.
program_serial_port()
{
    # Match the whole existing port number ("ttyS" followed by any digits)
    # rather than a single character: with "ttyS." a two-digit port such as
    # ttyS10 would turn into ttyS100 on the next run, because only "ttyS1"
    # would be replaced.
    sed -i "s|ttyS[0-9]*|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service
    systemctl daemon-reload
    systemctl restart serial-port-watchdog.service
}
# Read the image version: second space-separated field of the build_version
# line in sonic_version.yml (the eval presumably strips quoting around the
# value -- confirm against the file format).
eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")
# Resolve the platform name, apply the defaults, let installer.conf override
# them, then repoint and restart the serial port watchdog. These calls are
# unconditional: they run before the firsttime check that follows.
setup_platform
setup_platform_defaults
load_platform_installer_config
program_serial_port
if [ -f /host/image-$sonic_version/platform/firsttime ]; then
if [ -z "$platform" ]; then
echo "Unknown sonic platform"
firsttime_exit
fi

View File

@ -0,0 +1,328 @@
#!/usr/bin/env python
from __future__ import print_function, with_statement
import argparse
import logging
import logging.handlers
import os
import time
import signal
import socket
import sys
from collections import namedtuple
PRGNAME = 'serial-port-watchdog'
DEVFS_PATH = '/dev'
PROCFS_PATH = '/proc'
# Field names of /proc/<pid>/stat, in file order, according to procfs(5).
ProcStat = namedtuple(
    'ProcStat',
    'pid comm state ppid pgrp session tty_nr tpgid '
    'flags minflt cminflt majflt cmajflt utime stime '
    'cutime cstime priority nice num_threads itrealvalue '
    'starttime vsize rss rsslim startcode endcode '
    'startstack kstkesp kstkeip signal blocked sigignore '
    'sigcatch wchan nswap cnswap exit_signal processor '
    'rt_priority policy delayacct_blkio_ticks guest_time '
    'cguest_time start_data end_data start_brk arg_start '
    'arg_end env_start env_end exit_code'
)
# Counter names of /proc/<pid>/io, in file order, according to procfs(5).
ProcIo = namedtuple(
    'ProcIo',
    'rchar wchar syscr syscw read_bytes write_bytes cancelled_write_bytes'
)
class Process( object ):
    """Accessor for the procfs entries of a single process.

    Attributes:
        pid: process id, as passed to the constructor.
        path: the procfs directory of the process (<path>/<pid>).
        childs, parent: relationship links, maintained by the monitor.
        stat: the ProcStat read by the last refresh(), None before that.
        io, stack, stackStartTime: state kept by checkStuck() between calls.
    """

    def __init__( self, pid, path=PROCFS_PATH ):
        self.pid = pid
        self.path = os.path.join( path, str( pid ) )
        self.childs = []
        self.parent = None
        self.stat = None
        self.io = None
        self.stack = None
        self.stackStartTime = None

    def refresh( self ):
        """Re-read the 'stat' file into self.stat; all fields stay strings."""
        with open( os.path.join( self.path, 'stat' ) ) as f:
            data = f.read().rstrip()
        # The comm field is wrapped in parentheses and may itself contain
        # spaces or parentheses, so a plain whitespace split miscounts the
        # fields. Cut around the first '(' and the last ')' instead; comm
        # keeps its parentheses, as it did with the plain split.
        start = data.find( '(' )
        end = data.rfind( ')' )
        if start != -1 and end > start:
            fields = data[ :start ].split()
            fields.append( data[ start:end + 1 ] )
            fields.extend( data[ end + 1: ].split() )
        else:
            fields = data.split()
        # Tolerate kernels exposing more or fewer fields than ProcStat knows:
        # extra ones are dropped, missing ones are set to None.
        count = len( ProcStat._fields )
        fields = fields[ :count ] + [ None ] * ( count - len( fields ) )
        self.stat = ProcStat( *fields )

    def getStat( self, key=None ):
        """Refresh and return the stat tuple, or only field `key` if given."""
        self.refresh()
        if key is None:
            return self.stat
        return getattr( self.stat, key )

    def uid( self ):
        """Return '<pid>/<starttime>'; requires a previous refresh()."""
        return '%s/%s' % ( self.pid, self.stat.starttime )

    def ppid( self ):
        """Return the parent pid as read from stat (a string)."""
        return self.stat.ppid

    def name( self ):
        """Return the content of the 'comm' file, trailing whitespace removed."""
        with open( os.path.join( self.path, 'comm' ) ) as f:
            return f.read().rstrip()

    def getTtyForFd( self, fd ):
        """Return the link target of fd/<fd>, or '' when it cannot be resolved."""
        path = os.path.join( self.path, 'fd', str( fd ) )
        if not os.path.exists( path ):
            return ''
        try:
            return os.readlink( path )
        except OSError:
            # the descriptor (or the whole process) went away after the check
            return ''

    def getStack( self ):
        """Return the raw content of the 'stack' file."""
        with open( os.path.join( self.path, 'stack' ) ) as f:
            return f.read()

    def getIo( self ):
        """Return the counters of the 'io' file as a ProcIo of integers."""
        values = {}
        with open( os.path.join( self.path, 'io' ) ) as f:
            for line in f:
                key, sep, value = line.partition( ':' )
                if sep:
                    values[ key.strip() ] = int( value )
        # Look the counters up by name so that an unexpected or missing line
        # does not break the tuple construction; missing counters read as 0.
        return ProcIo( *[ values.get( name, 0 ) for name in ProcIo._fields ] )

    def isUsingTty( self, tty ):
        """Tell whether file descriptor 0 links to a path ending with `tty`."""
        return self.getTtyForFd( 0 ).endswith( tty )

    def checkStuck( self, content ):
        """Return for how many seconds the process has looked stuck.

        Args:
            content: iterable of substrings to look for in the stack file.

        Returns:
            0 when none of the substrings is in the stack, or when the stack
            or the I/O counters changed since the previous call (the timer is
            then restarted). Otherwise the number of seconds elapsed since
            the current stack and counters were first recorded.
        """
        stack = self.getStack()
        if not any( match in stack for match in content ):
            self.io = None
            self.stack = None
            self.stackStartTime = None
            return 0

        io = self.getIo()
        if self.stack != stack or self.io != io:
            self.io = io
            self.stack = stack
            self.stackStartTime = time.time()
            return 0

        return time.time() - self.stackStartTime

    def __repr__( self ):
        # uid() needs self.stat; fall back to the pid before the first refresh
        if self.stat is None:
            return '<Process pid=%s>' % self.pid
        return '<Process uid=%s>' % self.uid()
class ProcessMonitor( object ):
    """Keep track of the processes selected by the filters and kill stuck ones.

    Attributes:
        path: procfs mount point scanned for numeric (pid) entries.
        procs: monitored Process objects, keyed by integer pid.
        filters: ( func, args ) pairs; a process is a candidate for monitoring
            when any func( proc, *args ) is true.
        checkers: ( func, args ) pairs; func( proc, *args ) returns the number
            of seconds the process has been stuck (0 when it is not).
        whitelist: substrings of process names that are never monitored.
    """

    def __init__( self, path=PROCFS_PATH ):
        self.path = path
        self.procs = {}
        self.filters = []
        self.checkers = []
        self.whitelist = []

    def addProcessFilter( self, func, *args ):
        """Register a filter called as func( proc, *args )."""
        self.filters.append( ( func, args ) )

    def addStuckChecker( self, func, *args ):
        """Register a stuck checker called as func( proc, *args )."""
        self.checkers.append( ( func, args ) )

    def setWhitelist( self, whitelist ):
        """Replace the list of name substrings that must never be handled."""
        self.whitelist = whitelist

    def shouldHandleProcess( self, proc ):
        """Tell whether proc matches a filter and is not whitelisted."""
        if not any( func( proc, *args ) for func, args in self.filters ):
            return False
        name = proc.name()
        return not any( item in name for item in self.whitelist )

    def getRunningPids( self ):
        """Return the numeric entries of the procfs directory as integers."""
        return [ int( entry ) for entry in os.listdir( self.path )
                 if entry.isdigit() ]

    def killStuckProcess( self, proc, elapsed, kill, timeout ):
        """Log, and kill when `kill` is true, a process stuck past `timeout`.

        Nothing happens when elapsed is 0. Below the timeout an informational
        message is logged once more than half of it has elapsed.
        """
        if not elapsed:
            return

        if elapsed < timeout:
            # 2.0 keeps the comparison exact if an integer timeout is passed
            if elapsed > timeout / 2.0:
                logging.info( 'process %d seems stuck, idle for %ds, waiting '
                              'some more time', proc.pid, elapsed )
            return

        logging.warning( 'process %d has been stuck for %d seconds, killing...',
                         proc.pid, elapsed )
        logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack )
        if kill:
            # XXX: SIGTERM sleep then if alive SIGKILL ?
            try:
                os.kill( proc.pid, signal.SIGKILL )
            except OSError as e:
                # typically the process exited on its own in the meantime
                logging.warning( 'failed to kill process %d: %s', proc.pid, e )

    def killStuckProcesses( self, kill, timeout ):
        """Run every checker on every monitored process and act on the result."""
        for proc in list( self.procs.values() ):
            try:
                for checker, args in self.checkers:
                    elapsed = checker( proc, *args )
                    self.killStuckProcess( proc, elapsed, kill, timeout )
            except ( IOError, OSError ) as e:
                # The procfs files disappear when the process exits; skip it
                # instead of taking the whole watchdog down.
                logging.warning( 'could not inspect process %d: %s',
                                 proc.pid, e )

    def updatePid( self, pid ):
        """Start, keep or restart the monitoring of one running pid."""
        # honour the monitor's procfs path rather than the module default
        p = Process( pid, self.path )

        # if the process is already monitored (previously running)
        r = self.procs.get( pid, None )
        if r:
            p.refresh()
            # if the process is still running
            if p.uid() == r.uid():
                logging.debug( 'process %d still running', pid )
                return
            # or the pid was reused but the process is different
            logging.debug( 'pid %d reused for another process', pid )
            del self.procs[ pid ]

        # check if the process is relevant for monitoring
        if not self.shouldHandleProcess( p ):
            return

        logging.debug( 'watching process %d', pid )
        p.refresh()
        self.procs[ pid ] = p

    def updateParenting( self ):
        """Rebuild the parent / childs links between monitored processes."""
        # clear parent and childs for monitored processes
        for proc in self.procs.values():
            del proc.childs[:]
            proc.parent = None

        # set parent and childs for monitored processes
        for proc in self.procs.values():
            # self.procs is keyed by integer pid while the stat fields are
            # read as strings, so convert before the lookup
            parent = self.procs.get( int( proc.ppid() ), None )
            if parent:
                proc.parent = parent
                parent.childs.append( proc )

    def update( self ):
        """Synchronise self.procs with the processes currently in procfs."""
        pids = self.getRunningPids()
        running = set( pids )

        # remove defunct processes
        for pid in list( self.procs ):
            if pid not in running:
                logging.debug( 'process %d is defunct', pid )
                del self.procs[ pid ]

        # create or update running processes information
        for pid in pids:
            try:
                self.updatePid( pid )
            except ( IOError, OSError ) as e:
                # The process exited between the directory listing and the
                # reads: forget it and carry on with the others.
                logging.debug( 'process %s could not be inspected: %s',
                               pid, e )
                self.procs.pop( pid, None )
            except Exception:
                logging.warning( 'An issue occurred while updating process %s',
                                 pid )
                raise

        # parent / childs links are currently not refreshed:
        #self.updateParenting()
def checkRootPermissions():
    """Log an error and exit with status 1 unless the effective uid is 0."""
    isRoot = os.geteuid() == 0
    if isRoot:
        return
    logging.error( 'You must be root to use this feature' )
    sys.exit( 1 )
def getHostname():
    """Return the host name, or 'localhost' when it cannot be determined."""
    try:
        return socket.gethostname()
    except Exception:
        # Best effort only; unlike a bare except this lets KeyboardInterrupt
        # and SystemExit propagate.
        return 'localhost'
def setupLogging( verbose=False ):
    """Attach a stdout handler and a syslog handler to the root logger.

    Args:
        verbose: when true the stdout handler emits DEBUG and above,
            otherwise INFO and above.

    The syslog handler emits WARNING and above, whatever `verbose` is.
    """
    loglevel = logging.DEBUG if verbose else logging.INFO

    # the root logger lets everything through; each handler filters by level
    log = logging.getLogger()
    log.setLevel( logging.DEBUG )

    logOut = logging.StreamHandler( sys.stdout )
    logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) )
    logOut.setLevel( loglevel )
    log.addHandler( logOut )

    logSys = logging.handlers.SysLogHandler()
    # prefix every message with "<hostname> <program>: "
    fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME )
    logSys.setFormatter( logging.Formatter( fmt ) )
    logSys.setLevel( logging.WARNING )
    log.addHandler( logSys )

    try:
        # the connection to the syslog socket happens with the first message
        # NOTE(review): this record is INFO while the syslog handler only
        # accepts WARNING and above, so it may never reach that handler --
        # confirm whether the probe is still useful.
        log.info( 'Attaching to syslog' )
    except Exception:
        # narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are not swallowed
        log.warning( 'Failed open syslog' )
def listParser( value ):
    """Split a comma separated option value into a list of items.

    Surrounding whitespace is removed from every item and empty items are
    dropped. An empty item would otherwise become '', which is a substring of
    every stack trace and of every process name, and would therefore match
    everything.

    Args:
        value: the raw option string, e.g. 'agetty,login'.

    Returns:
        The list of non-empty items; [] for an empty or blank value.
    """
    items = ( item.strip() for item in value.split( ',' ) )
    return [ item for item in items if item ]
def ttyParser( dev, path=DEVFS_PATH ):
    """Turn a tty name into the path of an existing device node.

    Args:
        dev: device name ('ttyS0') or full path ('/dev/ttyS0').
        path: directory prepended to `dev` when `dev` does not already start
            with it.

    Returns:
        The device path.

    Raises:
        argparse.ArgumentTypeError: when the resulting path does not exist.
    """
    # use the `path` argument (it used to be ignored in favour of the global)
    if not dev.startswith( path ):
        dev = os.path.join( path, dev )
    if not os.path.exists( dev ):
        raise argparse.ArgumentTypeError( '%s is not a device' % dev )
    return dev
def parseArgs( args ):
    """Parse the command line of the watchdog.

    Args:
        args: list of argument strings, program name excluded.

    Returns:
        The argparse namespace holding dry_run, funcs, interval, timeout,
        tty, verbose and whitelist.
    """
    # ( option flags, add_argument keyword arguments ), in help order
    options = (
        ( ( '-d', '--dry-run' ),
          dict( action='store_true',
                help='only print processes that would be killed' ) ),
        ( ( '-f', '--funcs' ),
          dict( default=[ 'tty_' ], type=listParser,
                help='functions to look for in the stack trace' ) ),
        ( ( '-i', '--interval' ),
          dict( default=60, type=float,
                help='interval at which to check the procfs' ) ),
        ( ( '-k', '--timeout' ),
          dict( default=3600, type=float,
                help='timeout for which a process gets killed' ) ),
        ( ( '-t', '--tty' ),
          dict( default='ttyS0', type=ttyParser,
                help='tty to check for stuck process' ) ),
        ( ( '-v', '--verbose' ),
          dict( action='store_true',
                help='print all debug messages' ) ),
        ( ( '-w', '--whitelist' ),
          dict( default=[ 'agetty' ], type=listParser,
                help='whitelist programs that should never be killed' ) ),
    )

    parser = argparse.ArgumentParser()
    for flags, settings in options:
        parser.add_argument( *flags, **settings )
    return parser.parse_args( args )
def main( args ):
    """Run the watchdog loop: rescan procfs, kill stuck processes, sleep.

    Args:
        args: command line arguments, program name excluded.

    The loop has no exit condition, so the final return is only reached if
    the loop is ever given one.
    """
    opts = parseArgs( args )
    setupLogging( opts.verbose )
    checkRootPermissions()

    monitor = ProcessMonitor()
    monitor.setWhitelist( opts.whitelist )
    monitor.addProcessFilter( Process.isUsingTty, opts.tty )
    monitor.addStuckChecker( Process.checkStuck, opts.funcs )

    # in dry-run mode stuck processes are only reported
    doKill = not opts.dry_run
    while True:
        logging.debug( 'updating processes' )
        monitor.update()
        monitor.killStuckProcesses( kill=doKill, timeout=opts.timeout )
        time.sleep( opts.interval )

    return 0


if __name__ == '__main__':
    exitCode = main( sys.argv[ 1: ] )
    sys.exit( exitCode )

View File

@ -0,0 +1,12 @@
# systemd unit running the serial port watchdog script.
[Unit]
Description=Monitor serial port processes, kill stuck ones
Requires=
# NOTE(review): systemd unit names are case-sensitive and end in ".service";
# confirm that "rc.local.Service" resolves to the intended unit. Be careful
# when changing it: rc.local restarts this service itself, so an ordering
# that actually takes effect could make that restart wait -- verify first.
After=rc.local.Service
[Service]
# The tty passed with -t is rewritten in this file by rc.local according to
# the platform's CONSOLE_DEV setting; ttyS0 is only the shipped default.
ExecStart=/usr/bin/serial-port-watchdog.py -t ttyS0
# Restart the watchdog whenever it exits, without delay.
Restart=always
RestartSec=0
[Install]
WantedBy=multi-user.target