From 13cec4c48609ca389155b77c9351814d9d2203ef Mon Sep 17 00:00:00 2001 From: yozhao101 <56170650+yozhao101@users.noreply.github.com> Date: Fri, 25 Sep 2020 00:28:28 -0700 Subject: [PATCH] [Monit] Unmonitor the processes in containers which are disabled. (#5153) We want to let Monit to unmonitor the processes in containers which are disabled in `FEATURE` table such that Monit will not generate false alerting messages into the syslog. Signed-off-by: Yong Zhao --- .../base_image_files/monit_database | 4 +- .../docker-fpm-frr/base_image_files/monit_bgp | 24 ++++---- .../docker-lldp/base_image_files/monit_lldp | 12 ++-- .../base_image_files/monit_swss | 42 +++++++------- .../docker-sflow/base_image_files/monit_sflow | 4 +- .../docker-snmp/base_image_files/monit_snmp | 8 +-- .../base_image_files/monit_restapi | 4 +- .../base_image_files/monit_telemetry | 8 +-- .../docker-teamd/base_image_files/monit_teamd | 11 ++++ .../build_templates/sonic_debian_extension.j2 | 5 ++ files/image_config/monit/process_checker | 57 +++++++++++++++++++ .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 8 +-- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 8 +-- rules/docker-teamd.mk | 1 + 21 files changed, 149 insertions(+), 75 deletions(-) create mode 100644 dockers/docker-teamd/base_image_files/monit_teamd create mode 100755 files/image_config/monit/process_checker diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index c550892286..c1addd8a6f 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -3,5 +3,5 @@ ## process list: ## redis_server ############################################################################### -check process redis_server matching "/usr/bin/redis-server" - if does not exist for 5 times within 5 cycles then alert +check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index b9726c6195..46ddc62901 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -8,20 +8,20 @@ ## bgpcfgd ## bgpmon ############################################################################### -check process zebra matching "/usr/lib/frr/zebra" - if does not exist for 5 times within 5 cycles then alert +check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra" + if status != 0 for 5 times within 5 cycles then alert -check process fpmsyncd matching "fpmsyncd" - if does not exist for 5 times within 5 cycles then alert +check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpd matching "/usr/lib/frr/bgpd" - if does not exist for 5 times within 5 cycles then alert +check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd" + if status != 0 for 5 times within 5 cycles then alert -check process staticd matching "/usr/lib/frr/staticd" - if does not exist for 5 times within 5 cycles then alert +check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpcfgd matching "python /usr/local/bin/bgpcfgd" - if does not exist for 5 times within 5 cycles then alert +check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpmon matching "python /usr/local/bin/bgpmon" - if does not exist for 5 times within 5 cycles then alert +check program bgp|bgpmon with path "/usr/bin/process_checker bgp python /usr/local/bin/bgpmon" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index 200c52c7d3..194fa14a30 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ b/dockers/docker-lldp/base_image_files/monit_lldp @@ -5,11 +5,11 @@ ## lldp-syncd ## lldpmgrd ############################################################################### -check process lldpd_monitor matching "lldpd: " - if does not exist for 5 times within 5 cycles then alert +check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:" + if status != 0 for 5 times within 5 cycles then alert -check process lldp_syncd matching "python2 -m lldp_syncd" - if does not exist for 5 times within 5 cycles then alert +check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd" + if status != 0 for 5 times within 5 cycles then alert -check process lldpmgrd matching "python /usr/bin/lldpmgrd" - if does not exist for 5 times within 5 cycles then alert +check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index 5928dbd4dd..f5f4389f3f 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -11,33 +11,33 @@ ## buffermgrd ## nbrmgrd ## vxlanmgrd -############################################################################### -check process orchagent matching "/usr/bin/orchagent -d /var/log/swss" - if does not exist for 5 times within 5 cycles then alert +############################################################################## +check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss" + if status != 0 for 5 times within 5 cycles then alert -check process portsyncd matching "/usr/bin/portsyncd" - if does not exist for 5 times within 5 cycles then alert +check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process neighsyncd matching "/usr/bin/neighsyncd" - if does not exist for 5 times within 5 cycles then alert +check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process vrfmgrd matching "/usr/bin/vrfmgrd" - if does not exist for 5 times within 5 cycles then alert +check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process vlanmgrd matching "/usr/bin/vlanmgrd" - if does not exist for 5 times within 5 cycles then alert +check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process intfmgrd matching "/usr/bin/intfmgrd" - if does not exist for 5 times within 5 cycles then alert +check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process portmgrd matching "/usr/bin/portmgrd" - if does not exist for 5 times within 5 cycles then alert +check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process buffermgrd matching "/usr/bin/buffermgrd -l" - if does not exist for 5 times within 5 cycles then alert +check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l" + if status != 0 for 5 times within 5 cycles then alert -check process nbrmgrd matching "/usr/bin/nbrmgrd" - if does not exist for 5 times within 5 cycles then alert +check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process vxlanmgrd matching "/usr/bin/vxlanmgrd" - if does not exist for 5 times within 5 cycles then alert +check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index d041f81001..217f2e6258 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ b/dockers/docker-sflow/base_image_files/monit_sflow @@ -3,5 +3,5 @@ ## process list: ## sflowmgrd ############################################################################### -check process sflowmgrd matching "/usr/bin/sflowmgrd" - if does not exist for 5 times within 5 cycles then alert +check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index cfb1a2b668..a943985abc 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -4,8 +4,8 @@ ## snmpd ## snmpd_subagent ############################################################################### -check process snmpd matching "/usr/sbin/snmpd\s" - if does not exist for 5 times within 5 cycles then alert +check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd" + if status != 0 for 5 times within 5 cycles then alert -check process snmp_subagent matching "python3 -m sonic_ax_impl" - if does not exist for 5 times within 5 cycles then alert +check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3.6 -m sonic_ax_impl" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 2e90baf30d..84e4366f4a 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -3,5 +3,5 @@ ## process list: ## restapi ############################################################################### -check process restapi matching "/usr/sbin/go-server-server" - if does not exist for 5 times within 5 cycles then alert +check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index a82c652f81..7365ce51d1 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -4,8 +4,8 @@ ## telemetry ## dialout_client ############################################################################### -check process telemetry matching "/usr/sbin/telemetry" - if does not exist for 5 times within 5 cycles then alert +check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry" + if status != 0 for 5 times within 5 cycles then alert -check process dialout_client matching "/usr/sbin/dialout_client_cli" - if does not exist for 5 times within 5 cycles then alert +check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-teamd/base_image_files/monit_teamd b/dockers/docker-teamd/base_image_files/monit_teamd new file mode 100644 index 0000000000..256482aef2 --- /dev/null +++ b/dockers/docker-teamd/base_image_files/monit_teamd @@ -0,0 +1,11 @@ +############################################################################### +## Monit configuration for teamd container +## process list: +## teamsyncd +## teammgrd +############################################################################### +check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd" + if status != 0 for 5 times within 5 cycles then alert + +check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 2fbd6687d8..36b521af82 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -116,6 +116,9 @@ sudo rm -rf $FILESYSTEM_ROOT/$REDIS_DUMP_LOAD_PY2_WHEEL_NAME # Install Python module for ipaddress sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install ipaddress +# Install Python module for psutil +sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install psutil + # Install SwSS SDK Python 3 package # Note: the scripts will be overwritten by corresponding Python 2 package if [ -e {{swsssdk_py3_wheel_path}} ]; then @@ -239,6 +242,8 @@ sudo cp $IMAGE_CONFIGS/monit/monitrc $FILESYSTEM_ROOT/etc/monit/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/monitrc sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/* +sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/ +sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker # Copy crontabs sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/ diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker new file mode 100755 index 0000000000..ba48e37729 --- /dev/null +++ b/files/image_config/monit/process_checker @@ -0,0 +1,57 @@ +#!/usr/bin/python +import argparse +import sys +import syslog + +import psutil +import swsssdk + + +def check_process_existence(container_name, process_cmdline): + """ + @summary: Check whether the process in the specified container is running or not and + an alerting message will written into syslog if it failed to run. + """ + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + feature_table = config_db.get_table("FEATURE") + + if container_name in feature_table.keys(): + # We look into the 'FEATURE' table to verify whether the container is disabled or not. + # If the container is diabled, we exit. + if ("state" in feature_table[container_name].keys() + and feature_table[container_name]["state"] == "disabled"): + sys.exit(0) + else: + # We leveraged the psutil library to help us check whether the process is running or not. + # If the process entity is found in process tree and it is also in the 'running' or 'sleeping' + # state, then it will be marked as 'running'. + is_running = False + for process in psutil.process_iter(["cmdline", "status"]): + if ((' '.join(process.cmdline())).startswith(process_cmdline) and process.status() in ["running", "sleeping"]): + is_running = True + break + + if not is_running: + # If this script is run by Monit, then the following output will be appended to + # Monit's syslog message. + print("'{}' is not running.".format(process_cmdline)) + sys.exit(1) + else: + syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!" + .format(container_name)) + + +def main(): + parser = argparse.ArgumentParser(description="Check whether the process in the specified \ + container is running and an alerting message will be written into syslog if it \ + failed to run.", usage="/usr/bin/process_checker ") + parser.add_argument("container_name", help="container name") + parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process command line") + args = parser.parse_args() + + check_process_existence(args.container_name, ' '.join(args.process_cmdline)) + + +if __name__ == '__main__': + main() diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index 81c0b6ef6b..1195487700 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert -check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd" - if does not exist for 5 times within 5 cycles then alert +check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index 75391f90ac..14789c67c3 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index 81c0b6ef6b..1195487700 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert -check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd" - if does not exist for 5 times within 5 cycles then alert +check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/rules/docker-teamd.mk b/rules/docker-teamd.mk index c0fe6bfb6b..5442d5bf6b 100644 --- a/rules/docker-teamd.mk +++ b/rules/docker-teamd.mk @@ -27,4 +27,5 @@ $(DOCKER_TEAMD)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_TEAMD)_RUN_OPT += -v /host/warmboot:/var/warmboot $(DOCKER_TEAMD)_BASE_IMAGE_FILES += teamdctl:/usr/bin/teamdctl +$(DOCKER_TEAMD)_BASE_IMAGE_FILES += monit_teamd:/etc/monit/conf.d $(DOCKER_TEAMD)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)