[201911][Monit] Monitor critical processes in PMon container. (#7438)
Signed-off-by: Yong Zhao <yozhao@microsoft.com>. Why I did it: This PR aims to monitor the critical processes in the PMon container with Monit in the 201911 branch. How I did it: I created a template for the Monit configuration file; the generate_monit_config.service service renders it to generate the Monit configuration file of the PMon container. How to verify it: I verified this on a Mellanox device (str-msn2700-03) and an Arista device (str-a7050-acs-1). Which release branch to backport (provide reason below if selected): 201811 [x ] 201911 202006 202012
This commit is contained in:
parent
80f0836643
commit
a8d2d0b5cd
@ -0,0 +1,61 @@
|
||||
{# This template is used to generate the Monit configuration file of the platform monitor (PMon) container. -#}
|
||||
|
||||
###############################################################################
|
||||
## Monit configuration file for PMon container
|
||||
## process list:
|
||||
{% if not skip_fancontrol and HAVE_FANCONTROL_CONF == 1 %}
|
||||
## fancontrol
|
||||
{% endif %}
|
||||
{% if not skip_ledd %}
|
||||
## ledd
|
||||
{% endif %}
|
||||
{% if not skip_psud %}
|
||||
## psud
|
||||
{% endif %}
|
||||
{% if not skip_sensors and HAVE_SENSORS_CONF == 1 %}
|
||||
## sensord
|
||||
{% endif %}
|
||||
{% if not skip_syseepromd %}
|
||||
## syseepromd
|
||||
{% endif %}
|
||||
{% if not skip_thermalctld %}
|
||||
## thermalctld
|
||||
{% endif %}
|
||||
{% if not skip_xcvrd %}
|
||||
## xcvrd
|
||||
{% endif %}
|
||||
###############################################################################
|
||||
{# One Monit "check program" stanza is emitted per PMon daemon. A stanza is
   left out when the matching skip_<daemon> variable is truthy; the fancontrol
   and sensord stanzas additionally require HAVE_FANCONTROL_CONF /
   HAVE_SENSORS_CONF to equal 1. Every check runs /usr/bin/process_checker with
   the container name "pmon" followed by the daemon's command line, and raises
   an alert (repeated every cycle) once the exit status has been non-zero for
   5 times within 5 cycles.
   NOTE(review): process_checker is presumably expected to return non-zero when
   the given command line is not running in the container - confirm against
   that script. -#}
{% if not skip_fancontrol and HAVE_FANCONTROL_CONF == 1 %}
|
||||
check program pmon|fancontrol with path "/usr/bin/process_checker pmon /bin/bash /usr/sbin/fancontrol"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{% endif %}
|
||||
|
||||
{% if not skip_ledd %}
|
||||
check program pmon|ledd with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/ledd"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{% endif %}
|
||||
|
||||
{% if not skip_psud %}
|
||||
check program pmon|psud with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/psud"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{% endif %}
|
||||
|
||||
{% if not skip_sensors and HAVE_SENSORS_CONF == 1 %}
|
||||
check program pmon|sensord with path "/usr/bin/process_checker pmon /usr/sbin/sensord -f daemon"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{% endif %}
|
||||
|
||||
{% if not skip_syseepromd %}
|
||||
check program pmon|syseepromd with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/syseepromd"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{% endif %}
|
||||
|
||||
{% if not skip_thermalctld %}
|
||||
check program pmon|thermalctld with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/thermalctld"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{% endif %}
|
||||
|
||||
{% if not skip_xcvrd %}
|
||||
check program pmon|xcvrd with path "/usr/bin/process_checker pmon /usr/bin/python /usr/bin/xcvrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
{%- endif -%}
|
33
files/image_config/monit/generate_monit_config
Normal file → Executable file
33
files/image_config/monit/generate_monit_config
Normal file → Executable file
@ -1,12 +1,39 @@
|
||||
#!/bin/bash

# Generate the following files from templates:
# 1. Monit configuration file of dhcp_relay container
# 2. Monit configuration file of PMon container
# 3. Monit configuration file of radv container

# Templates rendered in a single sonic-cfggen run. Each -t value is a
# "<template>,<destination>" pair. Every template is listed exactly once so
# that no configuration file is rendered twice. An array is used so each
# argument reaches sonic-cfggen as one word without relying on word-splitting.
CFGGEN_PARAMS=(
    -d
    -t /usr/share/sonic/templates/monit_dhcp_relay.j2,/etc/monit/conf.d/monit_dhcp_relay
    -t /usr/share/sonic/templates/monit_radv.j2,/etc/monit/conf.d/monit_radv
)
sonic-cfggen "${CFGGEN_PARAMS[@]}"
|
||||
|
||||
# Generate the Monit configuration file of the PMon container.
#
# The platform name is looked up with sonic-cfggen. Nothing is generated when
# the lookup fails or returns an empty string; stderr of the lookup is
# discarded on purpose, so a failed lookup is silent.
if PLATFORM=$(sonic-cfggen -d -v DEVICE_METADATA.localhost.platform 2> /dev/null) \
        && [[ -n "$PLATFORM" ]]; then
    PLATFORM_DIR="/usr/share/sonic/device/$PLATFORM"
    SENSORS_CONF_FILE="$PLATFORM_DIR/sensors.conf"
    FANCONTROL_CONF_FILE="$PLATFORM_DIR/fancontrol"
    PMON_DAEMON_CONTROL_FILE="$PLATFORM_DIR/pmon_daemon_control.json"

    # Flags handed to the template: 1 when the platform directory contains
    # the corresponding file, 0 otherwise.
    HAVE_SENSORS_CONF=0
    HAVE_FANCONTROL_CONF=0

    if [[ -e "$SENSORS_CONF_FILE" ]]; then
        HAVE_SENSORS_CONF=1
    fi

    if [[ -e "$FANCONTROL_CONF_FILE" ]]; then
        HAVE_FANCONTROL_CONF=1
    fi

    # JSON object passed to sonic-cfggen via -a.
    confvar="{\"HAVE_SENSORS_CONF\":$HAVE_SENSORS_CONF, \"HAVE_FANCONTROL_CONF\":$HAVE_FANCONTROL_CONF}"

    # The PMon template is read from /usr/share/sonic/templates (plural), the
    # same directory as the other Monit templates rendered by this script.
    PMON_TEMPLATE_ARG="/usr/share/sonic/templates/monit_pmon.j2,/etc/monit/conf.d/monit_pmon"

    # The platform's pmon_daemon_control.json is added as an extra input only
    # when the platform provides one.
    if [[ -e "$PMON_DAEMON_CONTROL_FILE" ]]; then
        sonic-cfggen -j "$PMON_DAEMON_CONTROL_FILE" -a "$confvar" -t "$PMON_TEMPLATE_ARG"
    else
        sonic-cfggen -a "$confvar" -t "$PMON_TEMPLATE_ARG"
    fi
fi
|
||||
|
@ -46,4 +46,5 @@ $(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/bin/sensors
|
||||
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/sbin/smartctl
|
||||
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/sbin/iSmart
|
||||
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += cmd_wrapper:/usr/sbin/SmartCmd
|
||||
$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += monit_pmon.j2:/usr/share/sonic/templates
|
||||
$(DOCKER_PLATFORM_MONITOR)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
|
||||
|
Reference in New Issue
Block a user