[service] introducing serial port watchdog service (#1743)
* [rc.local] refactor platform identification code into a separate function
  Signed-off-by: Ying Xie <ying.xie@microsoft.com>
* [rc.local] add infrastructure to take action according to installer.conf
* [serial port watchdog] add service to watch serial port processes
  Monitor serial port processes. Kill ones stuck for too long.
  Signed-off-by: Ying Xie <ying.xie@microsoft.com>
* [rc.local] start watchdog on serial port specified by installer.conf
  Signed-off-by: Ying Xie <ying.xie@microsoft.com>
This commit is contained in:
parent
d165a5030a
commit
bb6ff62a32
@ -154,6 +154,11 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
|
||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service
|
||||
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/
|
||||
|
||||
# Copy serial-port-watchdog configuration scripts
|
||||
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/
|
||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service
|
||||
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/
|
||||
|
||||
# Copy updategraph script and service file
|
||||
sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/
|
||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service
|
||||
|
@ -183,15 +183,51 @@ for x in "$@"; do
|
||||
done
|
||||
}
|
||||
|
||||
eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")
|
||||
|
||||
if [ -f /host/image-$sonic_version/platform/firsttime ]; then
|
||||
|
||||
# Set $platform from the boot environment: $aboot_platform when it is
# non-empty, otherwise $onie_platform, otherwise the empty string.
setup_platform()
{
    platform=${aboot_platform:-$onie_platform}
}
|
||||
|
||||
# Establish platform defaults. Call this BEFORE sourcing installer.conf:
# any variable assigned here may be overridden by that file.
setup_platform_defaults()
{
    # Serial console index; 0 selects ttyS0.
    CONSOLE_DEV=0
}
|
||||
|
||||
# Source the platform's installer.conf, if present, so that it can override
# the values set by setup_platform_defaults (e.g. CONSOLE_DEV).
# Reads $platform; sets INSTALLER_CFG to the path that was probed.
load_platform_installer_config()
{
    INSTALLER_CFG="/usr/share/sonic/device/$platform/installer.conf"
    # Quote the path so an unusual platform string cannot be word-split or
    # glob-expanded into a different file name.
    if [ -f "$INSTALLER_CFG" ]; then
        . "$INSTALLER_CFG"
    fi
}
|
||||
|
||||
# Point the serial port watchdog at ttyS$CONSOLE_DEV by rewriting its unit
# file in place, then reload systemd and restart the service so the new
# command line takes effect.
program_serial_port()
{
    # Match the whole numeric suffix (one or more digits). The previous
    # pattern "ttyS." consumed exactly one character, so once a multi-digit
    # port such as ttyS10 had been written, the next run would leave a stray
    # trailing digit behind; it also matched non-digit characters.
    sed -i "s|ttyS[0-9][0-9]*|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service
    systemctl daemon-reload
    systemctl restart serial-port-watchdog.service
}
|
||||
|
||||
# Extract the build version from sonic_version.yml; eval strips the quotes
# that surround the value in the YAML file.
eval sonic_version=$(grep build_version /etc/sonic/sonic_version.yml | cut -f2 -d" ")

# Identify the platform, set defaults, then let installer.conf override them.
setup_platform
setup_platform_defaults
load_platform_installer_config

# Apply the (possibly overridden) console device to the watchdog service.
program_serial_port
|
||||
|
||||
if [ -f /host/image-$sonic_version/platform/firsttime ]; then
|
||||
|
||||
if [ -z "$platform" ]; then
|
||||
echo "Unknown sonic platform"
|
||||
firsttime_exit
|
||||
fi
|
||||
|
328
files/image_config/serial-port-watchdog/serial-port-watchdog.py
Executable file
328
files/image_config/serial-port-watchdog/serial-port-watchdog.py
Executable file
@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import print_function, with_statement
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
import time
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
# Program name inserted into the syslog message prefix.
PRGNAME = 'serial-port-watchdog'

# Default locations of the device nodes and of procfs.
DEVFS_PATH = '/dev'
PROCFS_PATH = '/proc'

# Fields of /proc/<pid>/stat, in file order, according to procfs(5).
ProcStat = namedtuple(
    'ProcStat',
    'pid comm state ppid pgrp session tty_nr tpgid '
    'flags minflt cminflt majflt cmajflt utime stime '
    'cutime cstime priority nice num_threads itrealvalue '
    'starttime vsize rss rsslim startcode endcode '
    'startstack kstkesp kstkeip signal blocked sigignore '
    'sigcatch wchan nswap cnswap exit_signal processor '
    'rt_priority policy delayacct_blkio_ticks guest_time '
    'cguest_time start_data end_data start_brk arg_start '
    'arg_end env_start env_end exit_code'
)

# Fields of /proc/<pid>/io, in file order, according to procfs(5).
ProcIo = namedtuple(
    'ProcIo',
    'rchar wchar syscr syscw read_bytes write_bytes cancelled_write_bytes'
)
|
||||
|
||||
class Process( object ):
    """Accessor for one process as exposed under <path>/<pid> in procfs.

    Besides reading procfs entries on demand, an instance remembers the
    kernel stack and I/O counters seen by the last checkStuck() call so
    that successive calls can tell for how long nothing has changed.
    """

    def __init__( self, pid, path=PROCFS_PATH ):
        """Create an accessor for process `pid` below procfs root `path`.

        Nothing is read from procfs here; call refresh() to load `stat`.
        """
        self.pid = pid
        self.path = os.path.join( path, str( pid ) )
        self.childs = []
        self.parent = None

        # Last parsed <pid>/stat record; None until refresh() is called.
        self.stat = None

        # State recorded by checkStuck(): the I/O counters and kernel stack
        # last observed, and when that combination was first seen.
        self.io = None
        self.stack = None
        self.stackStartTime = None

    def refresh( self ):
        """Re-read <pid>/stat and store it in self.stat as a ProcStat.

        All values are kept as strings; `comm` keeps its surrounding
        parentheses. Raises IOError/OSError if the process no longer exists.
        """
        with open( os.path.join( self.path, 'stat' ) ) as f:
            raw = f.read().rstrip()

        # The second field is "(comm)" and comm may itself contain spaces
        # or parentheses, so a plain whitespace split would misalign every
        # following field. Isolate it using the first '(' and last ')'.
        lparen = raw.find( '(' )
        rparen = raw.rfind( ')' )
        if 0 <= lparen < rparen:
            data = [ raw[ :lparen ].strip(), raw[ lparen:rparen + 1 ] ]
            data.extend( raw[ rparen + 1: ].split() )
        else:
            data = raw.split()

        # The number of fields depends on the kernel version: pad missing
        # trailing fields with None and ignore any we have no name for.
        count = len( ProcStat._fields )
        data = ( data + [ None ] * count )[ :count ]
        self.stat = ProcStat( *data )

    def getStat( self, key=None ):
        """Refresh and return the ProcStat record (`key` is currently unused)."""
        self.refresh()
        return self.stat

    def uid( self ):
        """Return '<pid>/<starttime>', which stays unique across pid reuse.

        refresh() must have been called beforehand.
        """
        return '%s/%s' % ( self.pid, self.stat.starttime )

    def ppid( self ):
        """Return the parent pid as read from stat (a string).

        refresh() must have been called beforehand.
        """
        return self.stat.ppid

    def name( self ):
        """Return the content of <pid>/comm without the trailing newline."""
        with open( os.path.join( self.path, 'comm' ) ) as f:
            return f.read().rstrip()

    def getTtyForFd( self, fd ):
        """Return the target of <pid>/fd/<fd>, or '' if it cannot be resolved."""
        path = os.path.join( self.path, 'fd', str( fd ) )
        if not os.path.exists( path ):
            return ''
        try:
            return os.readlink( path )
        except OSError:
            # the descriptor or the process went away after the check above
            return ''

    def getStack( self ):
        """Return the content of <pid>/stack (the kernel stack trace)."""
        with open( os.path.join( self.path, 'stack' ) ) as f:
            return f.read()

    def getIo( self ):
        """Return the counters of <pid>/io as a ProcIo of integers.

        Counters are matched by name rather than by position; a counter
        missing from the file is reported as 0 and unknown ones are ignored.
        """
        counters = {}
        with open( os.path.join( self.path, 'io' ) ) as f:
            for line in f:
                name, sep, value = line.partition( ':' )
                if sep:
                    counters[ name.strip() ] = int( value )
        return ProcIo( *[ counters.get( field, 0 ) for field in ProcIo._fields ] )

    def isUsingTty( self, tty ):
        """Return True if the process' stdin (fd 0) link ends with `tty`."""
        return self.getTtyForFd( 0 ).endswith( tty )

    def checkStuck( self, content ):
        """Return for how many seconds the process has looked stuck.

        The process is a stuck candidate while its kernel stack contains any
        of the substrings in `content`. Returns 0 when it is not a candidate,
        or when its stack or I/O counters changed since the previous call
        (which restarts the timer); otherwise returns the time elapsed since
        the current stack and counters were first observed.
        """
        stack = self.getStack()

        if not any( match in stack for match in content ):
            # not blocked in a function of interest: forget previous state
            self.io = None
            self.stack = None
            self.stackStartTime = None
            return 0

        io = self.getIo()

        if self.stack != stack or self.io != io:
            # something moved since last time: restart the timer
            self.io = io
            self.stack = stack
            self.stackStartTime = time.time()
            return 0

        return time.time() - self.stackStartTime

    def __repr__( self ):
        # stat is only available after refresh(); fall back to the bare pid
        if self.stat is None:
            return '<Process pid=%s>' % self.pid
        return '<Process uid=%s>' % self.uid()
|
||||
|
||||
class ProcessMonitor( object ):
    """Track processes found in procfs and kill the ones that stay stuck.

    Filters select which processes are watched, the whitelist excludes
    processes by name, and stuck checkers report for how long a watched
    process has been stuck.
    """

    def __init__( self, path=PROCFS_PATH ):
        """Create a monitor scanning the procfs mounted at `path`."""
        self.path = path
        # watched processes, keyed by integer pid
        self.procs = {}
        # lists of ( callable, extra positional args ) tuples
        self.filters = []
        self.checkers = []
        # substrings of process names that must never be watched
        self.whitelist = []

    @staticmethod
    def _processGone( exc ):
        """Return True if `exc` reports a missing procfs entry or process."""
        import errno
        return getattr( exc, 'errno', None ) in ( errno.ENOENT, errno.ESRCH )

    def addProcessFilter( self, func, *args ):
        """Watch processes for which func( proc, *args ) is true."""
        self.filters.append( ( func, args ) )

    def addStuckChecker( self, func, *args ):
        """Register func( proc, *args ) returning the stuck time in seconds."""
        self.checkers.append( ( func, args ) )

    def setWhitelist( self, whitelist ):
        """Set the name substrings of processes that must never be watched."""
        self.whitelist = whitelist

    def shouldHandleProcess( self, proc ):
        """Return True if `proc` matches a filter and is not whitelisted."""
        if not any( func( proc, *args ) for func, args in self.filters ):
            return False

        name = proc.name()
        return not any( item in name for item in self.whitelist )

    def getRunningPids( self ):
        """Return the pids (as ints) of all numeric entries of the procfs."""
        return [ int( entry ) for entry in os.listdir( self.path )
                 if entry.isdigit() ]

    def killStuckProcess( self, proc, elapsed, kill, timeout ):
        """Kill `proc` if it has been stuck for at least `timeout` seconds.

        `elapsed` is the stuck time reported by a checker; a falsy value
        means the process is not stuck. Past half of the timeout an
        informational message is logged. When `kill` is false the decision
        is only logged.
        """
        if not elapsed:
            return

        if elapsed < timeout:
            if elapsed > timeout / 2:
                logging.info( 'process %d seems stuck, idle for %ds, waiting '
                              'some more time', proc.pid, elapsed )
            return

        logging.warning( 'process %d has been stuck for %d seconds, killing...',
                         proc.pid, elapsed )
        logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack )
        if kill:
            # XXX: SIGTERM sleep then if alive SIGKILL ?
            try:
                os.kill( proc.pid, signal.SIGKILL )
            except OSError as e:
                if not self._processGone( e ):
                    raise
                # it exited on its own in the meantime: nothing left to do
                logging.debug( 'process %d exited before it could be killed',
                               proc.pid )

    def killStuckProcesses( self, kill, timeout ):
        """Run every stuck checker on every watched process."""
        for proc in list( self.procs.values() ):
            for checker, args in self.checkers:
                try:
                    elapsed = checker( proc, *args )
                except ( IOError, OSError ) as e:
                    if not self._processGone( e ):
                        raise
                    # The process exited since the last update(); it will be
                    # dropped by the next one. Do not take the daemon down.
                    logging.debug( 'process %d vanished while being checked',
                                   proc.pid )
                    break
                self.killStuckProcess( proc, elapsed, kill, timeout )

    def updatePid( self, pid ):
        """Start, keep or stop watching the process currently using `pid`."""
        # honour the procfs root this monitor was created with
        p = Process( pid, self.path )

        # if the process is already monitored (previously running)
        r = self.procs.get( pid, None )
        if r:
            p.refresh()
            # if the process is still running
            if p.uid() == r.uid():
                logging.debug( 'process %d still running', pid )
                return
            # or the pid was reused but the process is different
            logging.debug( 'pid %d reused for another process', pid )
            del self.procs[ pid ]

        # check if the process is relevant for monitoring
        if not self.shouldHandleProcess( p ):
            return

        logging.debug( 'watching process %d', pid )
        p.refresh()
        self.procs[ pid ] = p

    def updateParenting( self ):
        """Rebuild the parent / childs links between watched processes."""
        # clear parent and childs for monitored processes
        for proc in self.procs.values():
            del proc.childs[:]
            proc.parent = None

        # set parent and childs for monitored processes
        for proc in self.procs.values():
            # stat values are strings whereas self.procs is keyed by int
            parent = self.procs.get( int( proc.ppid() ), None )
            if parent:
                proc.parent = parent
                parent.childs.append( proc )

    def update( self ):
        """Synchronise the set of watched processes with the procfs."""
        pids = self.getRunningPids()
        running = set( pids )

        # remove defunct processes
        for pid in list( self.procs ):
            if pid not in running:
                logging.debug( 'process %d is defunct', pid )
                del self.procs[ pid ]

        # create or update running processes information
        for pid in pids:
            try:
                self.updatePid( pid )
            except ( IOError, OSError ) as e:
                if not self._processGone( e ):
                    logging.warning( 'An issue occurred while updating '
                                     'process %s', pid )
                    raise
                # Processes routinely exit between the directory listing and
                # their inspection; crashing here would restart the daemon
                # and reset the stuck timer of every watched process.
                logging.debug( 'process %s vanished while updating', pid )
                self.procs.pop( pid, None )
            except Exception:
                logging.warning( 'An issue occurred while updating process %s',
                                 pid )
                raise

        #self.updateParenting()
|
||||
|
||||
def checkRootPermissions():
    """Log an error and exit with status 1 unless the effective uid is 0."""
    isRoot = ( os.geteuid() == 0 )
    if isRoot:
        return

    logging.error( 'You must be root to use this feature' )
    sys.exit( 1 )
|
||||
|
||||
def getHostname():
    """Return the local host name, or 'localhost' if it cannot be obtained."""
    try:
        return socket.gethostname()
    except socket.error:
        # Only lookup failures fall back to the default; a bare except
        # would also have swallowed KeyboardInterrupt and SystemExit.
        return 'localhost'
|
||||
|
||||
def setupLogging( verbose=False ):
    """Configure the root logger with a stdout and a syslog handler.

    The stdout handler shows INFO and above, or DEBUG and above when
    `verbose` is set. The syslog handler only forwards WARNING and above,
    prefixed with the host name and the program name.
    """
    dateFmt = '%Y-%m-%d %H:%M:%S'

    root = logging.getLogger()
    # let everything through; each handler applies its own threshold
    root.setLevel( logging.DEBUG )

    console = logging.StreamHandler( sys.stdout )
    console.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) )
    if verbose:
        console.setLevel( logging.DEBUG )
    else:
        console.setLevel( logging.INFO )
    root.addHandler( console )

    syslog = logging.handlers.SysLogHandler()
    # format to rfc5424 format
    prefix = '{} {}: %(message)s'.format( getHostname(), PRGNAME )
    syslog.setFormatter( logging.Formatter( prefix ) )
    syslog.setLevel( logging.WARNING )
    root.addHandler( syslog )

    try:
        # the connection to the syslog socket happens with the first message
        root.info( 'Attaching to syslog' )
    except:
        root.warning( 'Failed open syslog' )
|
||||
|
||||
def listParser( value ):
    """Split a comma separated option value into a list of strings.

    A blank value yields an empty list; items are not stripped.
    """
    return value.split( ',' ) if value.strip() else []
|
||||
|
||||
def ttyParser( dev, path=DEVFS_PATH ):
    """argparse type: resolve a tty name to an existing device path.

    `dev` may be a bare name ('ttyS0') or a path already starting with
    `path`; bare names are looked up below `path`. Raises
    argparse.ArgumentTypeError if the resulting path does not exist.
    """
    # use the `path` argument: it used to be ignored in favour of the
    # module default, which made the parameter ineffective
    if not dev.startswith( path ):
        dev = os.path.join( path, dev )
    if not os.path.exists( dev ):
        raise argparse.ArgumentTypeError( '%s is not a device' % dev )
    return dev
|
||||
|
||||
def parseArgs( args ):
    """Parse the command line `args` (without the program name).

    Returns the argparse namespace with attributes dry_run, funcs,
    interval, timeout, tty, verbose and whitelist.
    """
    parser = argparse.ArgumentParser()
    addOption = parser.add_argument

    # flags
    addOption( '-d', '--dry-run', action='store_true',
               help='only print processes that would be killed' )
    addOption( '-v', '--verbose', action='store_true',
               help='print all debug messages' )

    # what to look for, and where
    addOption( '-f', '--funcs', default=[ 'tty_' ], type=listParser,
               help='functions to look for in the stack trace' )
    addOption( '-t', '--tty', default='ttyS0', type=ttyParser,
               help='tty to check for stuck process' )
    addOption( '-w', '--whitelist', default=[ 'agetty' ], type=listParser,
               help='whitelist programs that should never be killed' )

    # timing, in seconds
    addOption( '-i', '--interval', default=60, type=float,
               help='interval at which to check the procfs' )
    addOption( '-k', '--timeout', default=3600, type=float,
               help='timeout for which a process gets killed' )

    return parser.parse_args( args )
|
||||
|
||||
def main( args ):
    """Run the watchdog loop forever using the command line `args`.

    Every `interval` seconds the watched process set is updated and the
    processes using the tty that have been stuck for longer than `timeout`
    are killed (or only reported in dry-run mode).
    """
    options = parseArgs( args )

    setupLogging( options.verbose )
    checkRootPermissions()

    monitor = ProcessMonitor()
    monitor.addProcessFilter( Process.isUsingTty, options.tty )
    monitor.addStuckChecker( Process.checkStuck, options.funcs )
    monitor.setWhitelist( options.whitelist )

    doKill = not options.dry_run
    while True:
        logging.debug( 'updating processes' )
        monitor.update()
        monitor.killStuckProcesses( kill=doKill, timeout=options.timeout )
        time.sleep( options.interval )

    # not reached: the loop above only ends through an exception
    return 0
|
||||
|
||||
# Script entry point: pass the arguments after the program name to main()
# and use its return value as the exit status.
if __name__ == '__main__':
    exitCode = main( sys.argv[ 1: ] )
    sys.exit( exitCode )
|
||||
|
@ -0,0 +1,12 @@
|
||||
[Unit]
Description=Monitor serial port processes, kill stuck ones
Requires=
# Unit names are case-sensitive and must end in a valid type suffix; the
# previous "rc.local.Service" is not a valid unit name, so no ordering was
# actually applied.
# NOTE(review): confirm the rc.local unit name on the target image
# (rc.local.service vs rc-local.service).
After=rc.local.service

[Service]
# The ttyS device below is rewritten at boot by rc.local according to
# CONSOLE_DEV (default 0, overridable from the platform's installer.conf).
ExecStart=/usr/bin/serial-port-watchdog.py -t ttyS0
Restart=always
RestartSec=0

[Install]
WantedBy=multi-user.target
|
Reference in New Issue
Block a user