[201911][Monit] Monitor critical processes in PMon container. (#7438)

Signed-off-by: Yong Zhao yozhao@microsoft.com

Why I did it
This PR uses Monit to monitor the critical processes in the PMon container on the 201911 branch.

How I did it
I created a Monit configuration template for the PMon container. The generate_monit_config.service service
renders this template to generate the Monit configuration file of the PMon container.

How to verify it
I verified this on a Mellanox device str-msn2700-03 and an Arista device str-a7050-acs-1.

Which release branch to backport (provide reason below if selected)
[ ] 201811
[x] 201911
[ ] 202006
[ ] 202012
This commit is contained in:
yozhao101 2021-04-28 17:12:21 -07:00 committed by GitHub
parent 80f0836643
commit a8d2d0b5cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 92 additions and 3 deletions

View File

@ -0,0 +1,61 @@
{# This template file is used to generate Monit configuration file of platform monitor container -#}
{# Variables read by this template:
     skip_fancontrol, skip_ledd, skip_psud, skip_sensors, skip_syseepromd,
     skip_thermalctld, skip_xcvrd
         When one of these evaluates to true, the matching daemon is left out of
         both the process list and the checks below. Presumably they come from the
         platform's pmon_daemon_control.json; confirm against generate_monit_config.
     HAVE_FANCONTROL_CONF, HAVE_SENSORS_CONF
         Must equal 1 for the fancontrol / sensord entries to be emitted.
   The comments added here strip the newline that follows them, so they
   contribute nothing to the rendered file. -#}
###############################################################################
## Monit configuration file for PMon container
## process list:
{% if not skip_fancontrol and HAVE_FANCONTROL_CONF == 1 %}
## fancontrol
{% endif %}
{% if not skip_ledd %}
## ledd
{% endif %}
{% if not skip_psud %}
## psud
{% endif %}
{% if not skip_sensors and HAVE_SENSORS_CONF == 1 %}
## sensord
{% endif %}
{% if not skip_syseepromd %}
## syseepromd
{% endif %}
{% if not skip_thermalctld %}
## thermalctld
{% endif %}
{% if not skip_xcvrd %}
## xcvrd
{% endif %}
###############################################################################
{# One "check program" entry per enabled daemon. Each entry runs
   /usr/bin/process_checker with "pmon" followed by the daemon's command line,
   and alerts once the status has been non-zero for 5 times within 5 cycles,
   repeating the alert every cycle. The conditions mirror the process list above. -#}
{% if not skip_fancontrol and HAVE_FANCONTROL_CONF == 1 %}
check program pmon|fancontrol with path "/usr/bin/process_checker pmon /bin/bash /usr/sbin/fancontrol"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{% endif %}
{% if not skip_ledd %}
check program pmon|ledd with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/ledd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{% endif %}
{% if not skip_psud %}
check program pmon|psud with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/psud"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{% endif %}
{% if not skip_sensors and HAVE_SENSORS_CONF == 1 %}
check program pmon|sensord with path "/usr/bin/process_checker pmon /usr/sbin/sensord -f daemon"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{% endif %}
{% if not skip_syseepromd %}
check program pmon|syseepromd with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/syseepromd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{% endif %}
{% if not skip_thermalctld %}
check program pmon|thermalctld with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/thermalctld"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{% endif %}
{% if not skip_xcvrd %}
check program pmon|xcvrd with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/xcvrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
{%- endif -%}

33
files/image_config/monit/generate_monit_config Normal file → Executable file
View File

@ -1,12 +1,39 @@
#!/bin/bash
# Generate the following files from templates:
# 1. Monit configuration file of dhcp_relay container
# 2. Monit configuration file of PMon container
# 3. Monit configuration file of radv container

# Arguments for the sonic-cfggen call that renders the dhcp_relay and radv
# templates. They are held in an array so that every option and every
# "template,output" pair reaches sonic-cfggen as exactly one word, without
# depending on unquoted word splitting. Each template is listed once.
CFGGEN_PARAMS=(
  -d
  -t /usr/share/sonic/templates/monit_dhcp_relay.j2,/etc/monit/conf.d/monit_dhcp_relay
  -t /usr/share/sonic/templates/monit_radv.j2,/etc/monit/conf.d/monit_radv
)
sonic-cfggen "${CFGGEN_PARAMS[@]}"
#######################################
# Render the Monit configuration file of the PMon container for one platform.
# Arguments:
#   $1 - platform name; its files are looked up under
#        /usr/share/sonic/device/<platform>/
# Outputs:
#   Calls sonic-cfggen with the monit_pmon.j2 template and
#   /etc/monit/conf.d/monit_pmon as the "template,output" pair.
# Returns:
#   Exit status of sonic-cfggen.
#######################################
generate_pmon_monit_config() {
  local platform=$1
  local device_dir="/usr/share/sonic/device/${platform}"
  local daemon_control_file="${device_dir}/pmon_daemon_control.json"
  local have_sensors_conf=0
  local have_fancontrol_conf=0
  local confvar
  local -a cfggen_args=()

  # The template only emits the sensord / fancontrol checks when the platform
  # ships the corresponding configuration file.
  if [[ -e "${device_dir}/sensors.conf" ]]; then
    have_sensors_conf=1
  fi
  if [[ -e "${device_dir}/fancontrol" ]]; then
    have_fancontrol_conf=1
  fi
  confvar="{\"HAVE_SENSORS_CONF\":${have_sensors_conf}, \"HAVE_FANCONTROL_CONF\":${have_fancontrol_conf}}"

  # pmon_daemon_control.json is optional; pass it with -j only when present.
  if [[ -e "${daemon_control_file}" ]]; then
    cfggen_args+=(-j "${daemon_control_file}")
  fi

  # The template lives in /usr/share/sonic/templates (plural), the same
  # directory that holds the other Monit templates rendered by this script.
  cfggen_args+=(
    -a "${confvar}"
    -t /usr/share/sonic/templates/monit_pmon.j2,/etc/monit/conf.d/monit_pmon
  )
  sonic-cfggen "${cfggen_args[@]}"
}

# Only generate the PMon configuration when the platform name can be read and
# is non-empty; errors from the lookup itself are deliberately silenced.
if PLATFORM=$(sonic-cfggen -d -v DEVICE_METADATA.localhost.platform 2> /dev/null) \
  && [[ -n "${PLATFORM}" ]]; then
  generate_pmon_monit_config "${PLATFORM}"
fi

View File

@ -46,4 +46,5 @@ $(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/bin/sensors
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/sbin/smartctl
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/sbin/iSmart
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/sbin/SmartCmd
# Monit template for the PMon container, destined for /usr/share/sonic/templates.
# NOTE(review): presumably this is where generate_monit_config reads it from - confirm the paths match.
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += monit_pmon.j2:/usr/share/sonic/templates
$(DOCKER_PLATFORM_MONITOR)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)