[Monit] Unmonitor the processes in containers which are disabled. (#5153)

We want to let Monit to unmonitor the processes in containers which are disabled in `FEATURE` table such that
Monit will not generate false alerting messages into the syslog.

Signed-off-by: Yong Zhao <yozhao@microsoft.com>
This commit is contained in:
yozhao101 2020-09-25 00:28:28 -07:00 committed by GitHub
parent b43f1129b4
commit 13cec4c486
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 149 additions and 75 deletions

View File

@ -3,5 +3,5 @@
## process list:
## redis_server
###############################################################################
check process redis_server matching "/usr/bin/redis-server"
if does not exist for 5 times within 5 cycles then alert
check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -8,20 +8,20 @@
## bgpcfgd
## bgpmon
###############################################################################
check process zebra matching "/usr/lib/frr/zebra"
if does not exist for 5 times within 5 cycles then alert
check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra"
if status != 0 for 5 times within 5 cycles then alert
check process fpmsyncd matching "fpmsyncd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd"
if status != 0 for 5 times within 5 cycles then alert
check process bgpd matching "/usr/lib/frr/bgpd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd"
if status != 0 for 5 times within 5 cycles then alert
check process staticd matching "/usr/lib/frr/staticd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd"
if status != 0 for 5 times within 5 cycles then alert
check process bgpcfgd matching "python /usr/local/bin/bgpcfgd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd"
if status != 0 for 5 times within 5 cycles then alert
check process bgpmon matching "python /usr/local/bin/bgpmon"
if does not exist for 5 times within 5 cycles then alert
check program bgp|bgpmon with path "/usr/bin/process_checker bgp python /usr/local/bin/bgpmon"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -5,11 +5,11 @@
## lldp-syncd
## lldpmgrd
###############################################################################
check process lldpd_monitor matching "lldpd: "
if does not exist for 5 times within 5 cycles then alert
check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:"
if status != 0 for 5 times within 5 cycles then alert
check process lldp_syncd matching "python2 -m lldp_syncd"
if does not exist for 5 times within 5 cycles then alert
check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd"
if status != 0 for 5 times within 5 cycles then alert
check process lldpmgrd matching "python /usr/bin/lldpmgrd"
if does not exist for 5 times within 5 cycles then alert
check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -11,33 +11,33 @@
## buffermgrd
## nbrmgrd
## vxlanmgrd
###############################################################################
check process orchagent matching "/usr/bin/orchagent -d /var/log/swss"
if does not exist for 5 times within 5 cycles then alert
##############################################################################
check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss"
if status != 0 for 5 times within 5 cycles then alert
check process portsyncd matching "/usr/bin/portsyncd"
if does not exist for 5 times within 5 cycles then alert
check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd"
if status != 0 for 5 times within 5 cycles then alert
check process neighsyncd matching "/usr/bin/neighsyncd"
if does not exist for 5 times within 5 cycles then alert
check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd"
if status != 0 for 5 times within 5 cycles then alert
check process vrfmgrd matching "/usr/bin/vrfmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd"
if status != 0 for 5 times within 5 cycles then alert
check process vlanmgrd matching "/usr/bin/vlanmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd"
if status != 0 for 5 times within 5 cycles then alert
check process intfmgrd matching "/usr/bin/intfmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd"
if status != 0 for 5 times within 5 cycles then alert
check process portmgrd matching "/usr/bin/portmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd"
if status != 0 for 5 times within 5 cycles then alert
check process buffermgrd matching "/usr/bin/buffermgrd -l"
if does not exist for 5 times within 5 cycles then alert
check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l"
if status != 0 for 5 times within 5 cycles then alert
check process nbrmgrd matching "/usr/bin/nbrmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd"
if status != 0 for 5 times within 5 cycles then alert
check process vxlanmgrd matching "/usr/bin/vxlanmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## sflowmgrd
###############################################################################
check process sflowmgrd matching "/usr/bin/sflowmgrd"
if does not exist for 5 times within 5 cycles then alert
check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -4,8 +4,8 @@
## snmpd
## snmpd_subagent
###############################################################################
check process snmpd matching "/usr/sbin/snmpd\s"
if does not exist for 5 times within 5 cycles then alert
check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd"
if status != 0 for 5 times within 5 cycles then alert
check process snmp_subagent matching "python3 -m sonic_ax_impl"
if does not exist for 5 times within 5 cycles then alert
check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3.6 -m sonic_ax_impl"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## restapi
###############################################################################
check process restapi matching "/usr/sbin/go-server-server"
if does not exist for 5 times within 5 cycles then alert
check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -4,8 +4,8 @@
## telemetry
## dialout_client
###############################################################################
check process telemetry matching "/usr/sbin/telemetry"
if does not exist for 5 times within 5 cycles then alert
check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry"
if status != 0 for 5 times within 5 cycles then alert
check process dialout_client matching "/usr/sbin/dialout_client_cli"
if does not exist for 5 times within 5 cycles then alert
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -0,0 +1,11 @@
###############################################################################
## Monit configuration for teamd container
## process list:
## teamsyncd
## teammgrd
###############################################################################
check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd"
if status != 0 for 5 times within 5 cycles then alert
check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -116,6 +116,9 @@ sudo rm -rf $FILESYSTEM_ROOT/$REDIS_DUMP_LOAD_PY2_WHEEL_NAME
# Install Python module for ipaddress
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install ipaddress
# Install Python module for psutil
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install psutil
# Install SwSS SDK Python 3 package
# Note: the scripts will be overwritten by corresponding Python 2 package
if [ -e {{swsssdk_py3_wheel_path}} ]; then
@ -239,6 +242,8 @@ sudo cp $IMAGE_CONFIGS/monit/monitrc $FILESYSTEM_ROOT/etc/monit/
sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/monitrc
sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/
sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/*
sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
# Copy crontabs
sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/

View File

@ -0,0 +1,57 @@
#!/usr/bin/python
import argparse
import sys
import syslog
import psutil
import swsssdk
def check_process_existence(container_name, process_cmdline):
"""
@summary: Check whether the process in the specified container is running or not and
an alerting message will written into syslog if it failed to run.
"""
config_db = swsssdk.ConfigDBConnector()
config_db.connect()
feature_table = config_db.get_table("FEATURE")
if container_name in feature_table.keys():
# We look into the 'FEATURE' table to verify whether the container is disabled or not.
# If the container is diabled, we exit.
if ("state" in feature_table[container_name].keys()
and feature_table[container_name]["state"] == "disabled"):
sys.exit(0)
else:
# We leveraged the psutil library to help us check whether the process is running or not.
# If the process entity is found in process tree and it is also in the 'running' or 'sleeping'
# state, then it will be marked as 'running'.
is_running = False
for process in psutil.process_iter(["cmdline", "status"]):
if ((' '.join(process.cmdline())).startswith(process_cmdline) and process.status() in ["running", "sleeping"]):
is_running = True
break
if not is_running:
# If this script is run by Monit, then the following output will be appended to
# Monit's syslog message.
print("'{}' is not running.".format(process_cmdline))
sys.exit(1)
else:
syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!"
.format(container_name))
def main():
parser = argparse.ArgumentParser(description="Check whether the process in the specified \
container is running and an alerting message will be written into syslog if it \
failed to run.", usage="/usr/bin/process_checker <container_name> <process_cmdline>")
parser.add_argument("container_name", help="container name")
parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process command line")
args = parser.parse_args()
check_process_existence(args.container_name, ' '.join(args.process_cmdline))
if __name__ == '__main__':
main()

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -4,8 +4,8 @@
## syncd
## dsserve
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd"
if does not exist for 5 times within 5 cycles then alert
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -4,8 +4,8 @@
## syncd
## dsserve
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd"
if does not exist for 5 times within 5 cycles then alert
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert

View File

@ -27,4 +27,5 @@ $(DOCKER_TEAMD)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_TEAMD)_RUN_OPT += -v /host/warmboot:/var/warmboot
$(DOCKER_TEAMD)_BASE_IMAGE_FILES += teamdctl:/usr/bin/teamdctl
$(DOCKER_TEAMD)_BASE_IMAGE_FILES += monit_teamd:/etc/monit/conf.d
$(DOCKER_TEAMD)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)