diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database
index c550892286..c1addd8a6f 100644
--- a/dockers/docker-database/base_image_files/monit_database
+++ b/dockers/docker-database/base_image_files/monit_database
@@ -3,5 +3,5 @@
 ## process list:
 ##  redis_server
 ###############################################################################
-check process redis_server matching "/usr/bin/redis-server"
-    if does not exist for 5 times within 5 cycles then alert
+check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp
index 7a70ef97db..46ddc62901 100644
--- a/dockers/docker-fpm-frr/base_image_files/monit_bgp
+++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp
@@ -8,20 +8,20 @@
 ##  bgpcfgd
 ##  bgpmon
 ###############################################################################
-check process zebra matching "/usr/lib/frr/zebra"
-    if does not exist for 5 times within 5 cycles then alert
+check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process fpmsyncd matching "fpmsyncd"
-    if does not exist for 5 times within 5 cycles then alert
+check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process bgpd matching "/usr/lib/frr/bgpd"
-    if does not exist for 5 times within 5 cycles then alert
+check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process staticd matching "/usr/lib/frr/staticd"
-    if does not exist for 5 times within 5 cycles then alert
+check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process bgpcfgd matching "python /usr/local/bin/bgpcfgd"
-    if does not exist for 5 times within 5 cycles then alert
+check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process bgpcfgd matching "python /usr/local/bin/bgpmon"
-    if does not exist for 5 times within 5 cycles then alert
+check program bgp|bgpmon with path "/usr/bin/process_checker bgp python /usr/local/bin/bgpmon"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-lldp-sv2/base_image_files/monit_lldp b/dockers/docker-lldp-sv2/base_image_files/monit_lldp
index 200c52c7d3..194fa14a30 100644
--- a/dockers/docker-lldp-sv2/base_image_files/monit_lldp
+++ b/dockers/docker-lldp-sv2/base_image_files/monit_lldp
@@ -5,11 +5,11 @@
 ##  lldp-syncd
 ##  lldpmgrd
 ###############################################################################
-check process lldpd_monitor matching "lldpd: "
-    if does not exist for 5 times within 5 cycles then alert
+check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process lldp_syncd matching "python2 -m lldp_syncd"
-    if does not exist for 5 times within 5 cycles then alert
+check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process lldpmgrd matching "python /usr/bin/lldpmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss
index 5928dbd4dd..f5f4389f3f 100644
--- a/dockers/docker-orchagent/base_image_files/monit_swss
+++ b/dockers/docker-orchagent/base_image_files/monit_swss
@@ -11,33 +11,33 @@
 ##  buffermgrd
 ##  nbrmgrd
 ##  vxlanmgrd
 ###############################################################################
-check process orchagent matching "/usr/bin/orchagent -d /var/log/swss"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process portsyncd matching "/usr/bin/portsyncd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process neighsyncd matching "/usr/bin/neighsyncd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process vrfmgrd matching "/usr/bin/vrfmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process vlanmgrd matching "/usr/bin/vlanmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process intfmgrd matching "/usr/bin/intfmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process portmgrd matching "/usr/bin/portmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process buffermgrd matching "/usr/bin/buffermgrd -l"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process nbrmgrd matching "/usr/bin/nbrmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process vxlanmgrd matching "/usr/bin/vxlanmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow
index d041f81001..217f2e6258 100644
--- a/dockers/docker-sflow/base_image_files/monit_sflow
+++ b/dockers/docker-sflow/base_image_files/monit_sflow
@@ -3,5 +3,5 @@
 ## process list:
 ##  sflowmgrd
 ###############################################################################
-check process sflowmgrd matching "/usr/bin/sflowmgrd"
-    if does not exist for 5 times within 5 cycles then alert
+check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-snmp-sv2/base_image_files/monit_snmp b/dockers/docker-snmp-sv2/base_image_files/monit_snmp
index 811f9d14b3..a943985abc 100644
--- a/dockers/docker-snmp-sv2/base_image_files/monit_snmp
+++ b/dockers/docker-snmp-sv2/base_image_files/monit_snmp
@@ -4,8 +4,8 @@
 ##  snmpd
 ##  snmpd_subagent
 ###############################################################################
-check process snmpd matching "/usr/sbin/snmpd -f"
-    if does not exist for 5 times within 5 cycles then alert
+check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process snmp_subagent matching "python3.6 -m sonic_ax_impl"
-    if does not exist for 5 times within 5 cycles then alert
+check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3.6 -m sonic_ax_impl"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi
index 2e90baf30d..84e4366f4a 100644
--- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi
+++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi
@@ -3,5 +3,5 @@
 ## process list:
 ##  restapi
 ###############################################################################
-check process restapi matching "/usr/sbin/go-server-server"
-    if does not exist for 5 times within 5 cycles then alert
+check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry
index a82c652f81..7365ce51d1 100644
--- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry
+++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry
@@ -4,8 +4,8 @@
 ##  telemetry
 ##  dialout_client
 ###############################################################################
-check process telemetry matching "/usr/sbin/telemetry"
-    if does not exist for 5 times within 5 cycles then alert
+check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process dialout_client matching "/usr/sbin/dialout_client_cli"
-    if does not exist for 5 times within 5 cycles then alert
+check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/dockers/docker-teamd/base_image_files/monit_teamd b/dockers/docker-teamd/base_image_files/monit_teamd
new file mode 100644
index 0000000000..256482aef2
--- /dev/null
+++ b/dockers/docker-teamd/base_image_files/monit_teamd
@@ -0,0 +1,11 @@
+###############################################################################
+## Monit configuration for teamd container
+## process list:
+##  teamsyncd
+##  teammgrd
+###############################################################################
+check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd"
+    if status != 0 for 5 times within 5 cycles then alert
+
+check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2
index 600221b18c..c05c88961d 100644
--- a/files/build_templates/sonic_debian_extension.j2
+++ b/files/build_templates/sonic_debian_extension.j2
@@ -109,6 +109,9 @@ sudo rm -rf $FILESYSTEM_ROOT/$REDIS_DUMP_LOAD_PY2_WHEEL_NAME
 # Install Python module for ipaddress
 sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install ipaddress
 
+# Install Python module for psutil
+sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install psutil
+
 # Install SwSS SDK Python 2 package
 SWSSSDK_PY2_WHEEL_NAME=$(basename {{swsssdk_py2_wheel_path}})
 sudo cp {{swsssdk_py2_wheel_path}} $FILESYSTEM_ROOT/$SWSSSDK_PY2_WHEEL_NAME
@@ -197,6 +200,8 @@ sudo cp $IMAGE_CONFIGS/monit/monitrc $FILESYSTEM_ROOT/etc/monit/
 sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/monitrc
 sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/
 sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/*
+sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
+sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
 
 # Copy crontabs
 sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/
diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker
new file mode 100755
index 0000000000..ba48e37729
--- /dev/null
+++ b/files/image_config/monit/process_checker
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+"""
+Monit helper which checks whether a process of a SONiC container is running.
+
+Exit status 0: the process is running, the container is disabled, or the container
+is unknown (an error is logged to syslog in that case). Exit status 1: the process
+is not running.
+"""
+import argparse
+import sys
+import syslog
+
+import psutil
+import swsssdk
+
+
+def check_process_existence(container_name, process_cmdline):
+    """
+    @summary: Check whether the process in the specified container is running. If it is
+              not, a message is printed to stdout and the script exits with status 1.
+    @param container_name: Name of the container, i.e. its key in the 'FEATURE' table.
+    @param process_cmdline: Command line prefix identifying the process.
+    """
+    config_db = swsssdk.ConfigDBConnector()
+    config_db.connect()
+    feature_table = config_db.get_table("FEATURE")
+
+    if container_name in feature_table.keys():
+        # We look into the 'FEATURE' table to verify whether the container is disabled or not.
+        # If the container is disabled, we exit.
+        if ("state" in feature_table[container_name].keys()
+                and feature_table[container_name]["state"] == "disabled"):
+            sys.exit(0)
+        else:
+            # We leveraged the psutil library to help us check whether the process is running or not.
+            # If the process entity is found in process tree and it is also in the 'running' or 'sleeping'
+            # state, then it will be marked as 'running'.
+            is_running = False
+            for process in psutil.process_iter(["cmdline", "status"]):
+                try:
+                    if ((' '.join(process.cmdline())).startswith(process_cmdline)
+                            and process.status() in ["running", "sleeping"]):
+                        is_running = True
+                        break
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    # The process went away (or became inaccessible) between being listed
+                    # and being queried, so it cannot be the process we are looking for.
+                    continue
+
+            if not is_running:
+                # If this script is run by Monit, then the following output will be appended to
+                # Monit's syslog message.
+                print("'{}' is not running.".format(process_cmdline))
+                sys.exit(1)
+    else:
+        syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!"
+                      .format(container_name))
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Check whether the process in the specified \
+            container is running and an alerting message will be written into syslog if it \
+            failed to run.", usage="/usr/bin/process_checker <container_name> <process_cmdline>")
+    parser.add_argument("container_name", help="container name")
+    parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process command line")
+    args = parser.parse_args()
+
+    check_process_existence(args.container_name, ' '.join(args.process_cmdline))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd
+++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd
index 81c0b6ef6b..1195487700 100644
--- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd
+++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd
@@ -4,8 +4,8 @@
 ##  syncd
 ##  dsserve
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd
+++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd
+++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd
+++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd
+++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd
+++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd
index 75391f90ac..14789c67c3 100644
--- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd
+++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd
@@ -3,5 +3,5 @@
 ## process list:
 ##  syncd
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd
index 81c0b6ef6b..1195487700 100644
--- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd
+++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd
@@ -4,8 +4,8 @@
 ##  syncd
 ##  dsserve
 ###############################################################################
-check process syncd matching "/usr/bin/syncd\s"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
 
-check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd"
-    if does not exist for 5 times within 5 cycles then alert
+check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
+    if status != 0 for 5 times within 5 cycles then alert
diff --git a/rules/docker-teamd.mk b/rules/docker-teamd.mk
index ce7b5bbab1..083a562c6c 100644
--- a/rules/docker-teamd.mk
+++ b/rules/docker-teamd.mk
@@ -29,4 +29,5 @@ $(DOCKER_TEAMD)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 $(DOCKER_TEAMD)_RUN_OPT += -v /host/warmboot:/var/warmboot
 
 $(DOCKER_TEAMD)_BASE_IMAGE_FILES += teamdctl:/usr/bin/teamdctl
+$(DOCKER_TEAMD)_BASE_IMAGE_FILES += monit_teamd:/etc/monit/conf.d
 $(DOCKER_TEAMD)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)