From 53373b63830d71518419826b38bf2044e089eb10 Mon Sep 17 00:00:00 2001 From: Qi Luo Date: Mon, 4 Jun 2018 21:06:58 -0700 Subject: [PATCH] Revert the feature: serial port watchdog service (#1766) * Revert "[serial watchdog] remove serial watchdog service dependency to rc.local (#1752)" * Revert "[service] introducing serial port watchdog service (#1743)" --- .../build_templates/sonic_debian_extension.j2 | 5 - files/image_config/platform/rc.local | 44 +-- .../serial-port-watchdog.py | 328 ------------------ .../serial-port-watchdog.service | 10 - 4 files changed, 4 insertions(+), 383 deletions(-) delete mode 100755 files/image_config/serial-port-watchdog/serial-port-watchdog.py delete mode 100644 files/image_config/serial-port-watchdog/serial-port-watchdog.service diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 301ae938a882..0849118b83a4 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -154,11 +154,6 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/ -# Copy serial-port-watchdog configuration scripts -sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/ -sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service -sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/ - # Copy updategraph script and service file sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/ sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service diff --git a/files/image_config/platform/rc.local b/files/image_config/platform/rc.local index ab88b7a3f5b4..de54d141ef7e 100755 --- a/files/image_config/platform/rc.local +++ b/files/image_config/platform/rc.local @@ -183,51 +183,15 @@ for x in "$@"; do done } -setup_platform() -{ +eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ") + +if [ -f /host/image-$sonic_version/platform/firsttime ]; then + if [ -n "$aboot_platform" ]; then platform=$aboot_platform elif [ -n "$onie_platform" ]; then platform=$onie_platform else - platform='' - fi -} - -# Setup default values in this function before reading installer.conf -# installer.conf could override the value set in this function. -setup_platform_defaults() -{ - # Default serial port: ttyS0 - CONSOLE_DEV=0 -} - -load_platform_installer_config() -{ - INSTALLER_CFG=/usr/share/sonic/device/$platform/installer.conf - if [ -f $INSTALLER_CFG ]; then - . $INSTALLER_CFG - fi -} - -program_serial_port() -{ - sed -i "s|ttyS.|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service - systemctl daemon-reload - systemctl restart serial-port-watchdog.service -} - -eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ") - -setup_platform -setup_platform_defaults -load_platform_installer_config - -program_serial_port - -if [ -f /host/image-$sonic_version/platform/firsttime ]; then - - if [ -z "$platform" ]; then echo "Unknown sonic platform" firsttime_exit fi diff --git a/files/image_config/serial-port-watchdog/serial-port-watchdog.py b/files/image_config/serial-port-watchdog/serial-port-watchdog.py deleted file mode 100755 index 15c57556d065..000000000000 --- a/files/image_config/serial-port-watchdog/serial-port-watchdog.py +++ /dev/null @@ -1,328 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function, with_statement - -import argparse -import logging -import logging.handlers -import os -import time -import signal -import socket -import sys - -from collections import namedtuple - -PRGNAME = 'serial-port-watchdog' - -DEVFS_PATH = '/dev' -PROCFS_PATH = '/proc' - -# According to procfs(5) -ProcStat = namedtuple( 'ProcStat', [ - 'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid', - 'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime', - 'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue', - 'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode', - 'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore', - 'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor', - 'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time', - 'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start', - 'arg_end', 'env_start', 'env_end', 'exit_code' -] ) - -# According to procfs(5) -ProcIo = namedtuple( 'ProcIo', [ - 'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes', - 'cancelled_write_bytes' -] ) - -class Process( object ): - def __init__( self, pid, path=PROCFS_PATH ): - self.pid = pid - self.path = os.path.join( path, str( pid ) ) - self.childs = [] - self.parent = None - - self.stat = None - - self.io = None - self.stack = None - self.stackStartTime = None - - def refresh( self ): - with open( os.path.join( self.path, 'stat' ) ) as f: - data = f.read().rstrip().split() - self.stat = ProcStat( *data ) - - def getStat( self, key=None ): - self.refresh() - return self.stat - - def uid( self ): - return '%s/%s' % ( self.pid, self.stat.starttime ) - - def ppid( self ): - return self.stat.ppid - - def name( self ): - with open( os.path.join( self.path, 'comm' ) ) as f: - return f.read().rstrip() - - def getTtyForFd( self, fd ): - path = os.path.join( self.path, 'fd', str( fd ) ) - if not os.path.exists( path ): - return '' - return os.readlink( path ) - - def getStack( self ): - with open( os.path.join( self.path, 'stack' ) ) as f: - return f.read() - - def getIo( self ): - with open( os.path.join( self.path, 'io' ) ) as f: - data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ] - return ProcIo( *data ) - - def isUsingTty( self, tty ): - return self.getTtyForFd( 0 ).endswith( tty ) - - def checkStuck( self, content ): - stack = self.getStack() - - found = False - for match in content: - if match in stack: - found = True - break - - if not found: - self.io = None - self.stack = None - self.stackStartTime = None - return 0 - - io = self.getIo() - - if self.stack != stack or self.io != io: - self.io = io - self.stack = stack - self.stackStartTime = time.time() - return 0 - - return time.time() - self.stackStartTime - - def __repr__( self ): - return '' % self.uid() - -class ProcessMonitor( object ): - def __init__( self, path=PROCFS_PATH ): - self.path = path - self.procs = {} - self.filters = [] - self.checkers = [] - self.whitelist = [] - - def addProcessFilter( self, func, *args ): - self.filters.append( ( func, args ) ) - - def addStuckChecker( self, func, *args ): - self.checkers.append( ( func, args ) ) - - def setWhitelist( self, whitelist ): - self.whitelist = whitelist - - def shouldHandleProcess( self, proc ): - matched = False - for func, args in self.filters: - if func( proc, *args ): - matched = True - break - - if not matched: - return False - - name = proc.name() - for item in self.whitelist: - if item in name: - return False - - return True - - def getRunningPids( self ): - pids = [] - for entry in os.listdir( self.path ): - if not entry.isdigit(): - continue - pids.append( int( entry ) ) - return pids - - def killStuckProcess( self, proc, elapsed, kill, timeout ): - if not elapsed: - return - - if elapsed < timeout: - if elapsed > timeout / 2: - logging.info( 'process %d seems stuck, idle for %ds, waiting ' - 'some more time', proc.pid, elapsed ) - return - - logging.warning( 'process %d has been stuck for %d seconds, killing...', - proc.pid, elapsed ) - logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack ) - if kill: - # XXX: SIGTERM sleep then if alive SIGKILL ? - os.kill( proc.pid, signal.SIGKILL ) - - def killStuckProcesses( self, kill, timeout ): - for proc in self.procs.values(): - for checker, args in self.checkers: - elapsed = checker( proc, *args ) - self.killStuckProcess( proc, elapsed, kill, timeout ) - - def updatePid( self, pid ): - p = Process( pid ) - - # if the process is already monitored (previously running) - r = self.procs.get( pid, None ) - if r: - p.refresh() - # if the process is still running - if p.uid() == r.uid(): - logging.debug( 'process %d still running', pid ) - return - # or the pid was reused but the process is different - logging.debug( 'pid %d reused for another process', pid ) - del self.procs[ pid ] - - # check if the process is relevant for monitoring - if not self.shouldHandleProcess( p ): - return - - logging.debug( 'watching process %d', pid ) - p.refresh() - self.procs[ pid ] = p - - def updateParenting( self ): - # clear parent and childs for monitored processes - for proc in self.procs.values(): - del proc.childs[:] - proc.parent = None - - # set parent and childs for monitored processes - for proc in self.procs.values(): - ppid = proc.ppid() - parent = self.procs.get( ppid, None ) - if parent: - proc.parent = parent - parent.childs.append( proc ) - - def update( self ): - pids = self.getRunningPids() - - # remove defunct processes - for pid in list(self.procs.keys()): - if pid not in pids: - logging.debug( 'process %d is defunct', pid ) - del self.procs[ pid ] - - # create or update running processes information - for pid in pids: - try: - self.updatePid( pid ) - except: - logging.warning( 'An issue occured whileupdating process %s', - pid ) - raise - - #self.updateParenting() - -def checkRootPermissions(): - if os.geteuid() != 0: - logging.error( 'You must be root to use this feature' ) - sys.exit( 1 ) - -def getHostname(): - try: - return socket.gethostname() - except: - return 'localhost' - -def setupLogging( verbose=False ): - loglevel = logging.DEBUG if verbose else logging.INFO - dateFmt = '%Y-%m-%d %H:%M:%S' - - log = logging.getLogger() - log.setLevel( logging.DEBUG ) - - logOut = logging.StreamHandler( sys.stdout ) - logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) ) - logOut.setLevel( loglevel ) - log.addHandler( logOut ) - - logSys = logging.handlers.SysLogHandler() - # format to rfc5424 format - fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME ) - logSys.setFormatter( logging.Formatter( fmt ) ) - logSys.setLevel( logging.WARNING ) - log.addHandler( logSys ) - try: - # the connection to the syslog socket happens with the first message - log.info( 'Attaching to syslog' ) - except: - log.warning( 'Failed open syslog' ) - -def listParser( value ): - if not value.strip(): - return [] - return value.split( ',' ) - -def ttyParser( dev, path=DEVFS_PATH ): - if not dev.startswith( DEVFS_PATH ): - dev = os.path.join( DEVFS_PATH, dev ) - if not os.path.exists( dev ): - raise argparse.ArgumentTypeError( '%s is not a device' % dev ) - return dev - -def parseArgs( args ): - parser = argparse.ArgumentParser() - - parser.add_argument( '-d', '--dry-run', action='store_true', - help='only print processes that would be killed' ) - parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser, - help='functions to look for in the stack trace' ) - parser.add_argument( '-i', '--interval', default=60, type=float, - help='interval at which to check the procfs' ) - parser.add_argument( '-k', '--timeout', default=3600, type=float, - help='timeout for which a process gets killed' ) - parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser, - help='tty to check for stuck process' ) - parser.add_argument( '-v', '--verbose', action='store_true', - help='print all debug messages' ) - parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser, - help='whitelist programs that should never be killed' ) - - return parser.parse_args( args ) - -def main( args ): - args = parseArgs( args ) - - setupLogging( args.verbose ) - checkRootPermissions() - - m = ProcessMonitor() - m.addProcessFilter( Process.isUsingTty, args.tty ) - m.addStuckChecker( Process.checkStuck, args.funcs ) - m.setWhitelist( args.whitelist ) - - while True: - logging.debug( 'updating processes' ) - m.update() - m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout ) - time.sleep( args.interval ) - - return 0 - -if __name__ == '__main__': - sys.exit( main( sys.argv[ 1: ] ) ) - diff --git a/files/image_config/serial-port-watchdog/serial-port-watchdog.service b/files/image_config/serial-port-watchdog/serial-port-watchdog.service deleted file mode 100644 index 1fcb15a103a5..000000000000 --- a/files/image_config/serial-port-watchdog/serial-port-watchdog.service +++ /dev/null @@ -1,10 +0,0 @@ -[Unit] -Description=Monitor serial port processes, kill stuck ones - -[Service] -ExecStart=/usr/bin/serial-port-watchdog.py -t ttyS0 -Restart=always -RestartSec=0 - -[Install] -WantedBy=multi-user.target