cc9c3f567e
- Why I did it Initially, we used Monit to monitor critical processes in each container. If one of the critical processes was not running or had crashed for some reason, Monit would periodically write an alerting message into syslog. If we added a new process to a container, the corresponding Monit configuration file also needed to be updated, which made maintenance somewhat difficult. We now use a Supervisord event listener to do this monitoring. Since the processes in each container are already managed by Supervisord, we only need to focus on the monitoring logic. - How I did it We use the event listener mechanism of Supervisord to monitor critical processes in containers. The event listener takes the following steps when it is notified that one of the critical processes exited unexpectedly: The event listener first checks whether the auto-restart mechanism is enabled for this container. If the auto-restart mechanism is enabled, the event listener kills the Supervisord process, which should cause the container to exit and subsequently get restarted. If the auto-restart mechanism is not enabled for this container, the event listener enters a loop which first sleeps for 1 minute and then checks whether the process is running. If yes, the event listener exits. If no, an alerting message is written into syslog. - How to verify it First, we need to check whether the auto-restart mechanism of a container is enabled by running the command show feature status. If it is enabled, one critical process should be selected and killed manually; then we need to check whether the container is restarted. Second, we can disable the auto-restart mechanism, if it was enabled at step 1, by running the command sudo config feature autorestart <container_name> disabled. Then one critical process should be selected and killed. After that, we will see the alerting message appear in the syslog every 1 minute. 
- Which release branch to backport (provide reason below if selected) 201811 201911 [x ] 202006
162 lines
5.1 KiB
Django/Jinja
162 lines
5.1 KiB
Django/Jinja
{# Global supervisord settings: the supervisord log is capped at 1MB with 2 rotated backups, and supervisord stays in the foreground (nodaemon=true). #}
[supervisord]
|
|
logfile_maxbytes=1MB
|
|
logfile_backups=2
|
|
nodaemon=true
|
|
|
|
{# Event listener that runs the supervisord_dependent_startup Python module and is fed PROCESS_STATE events. #}
{# Presumably this is what honors the dependent_startup / dependent_startup_wait_for keys set on the programs in this file - confirm against the plugin documentation. #}
{# autorestart=unexpected with exitcodes=0,3: exits with code 0 or 3 count as expected and do not trigger a restart. #}
[eventlistener:dependent-startup]
|
|
command=python3 -m supervisord_dependent_startup
|
|
autostart=true
|
|
autorestart=unexpected
|
|
startretries=0
|
|
exitcodes=0,3
|
|
events=PROCESS_STATE
|
|
buffer_size=50
|
|
|
|
{# Event listener launched with --container-name dhcp_relay; it subscribes to PROCESS_STATE_EXITED and PROCESS_STATE_RUNNING events. #}
{# NOTE(review): the listener script is not part of this file; presumably it reacts to unexpected exits of the container's critical processes - verify against /usr/bin/supervisor-proc-exit-listener. #}
[eventlistener:supervisor-proc-exit-listener]
|
|
command=/usr/bin/supervisor-proc-exit-listener --container-name dhcp_relay
|
|
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
|
|
autostart=true
|
|
autorestart=unexpected
|
|
|
|
{# Syslog daemon. supervisord does not start it on its own (autostart=false); it is flagged dependent_startup=true, and [program:start] waits for it to be running. #}
[program:rsyslogd]
|
|
command=/usr/sbin/rsyslogd -n -iNONE
|
|
priority=1
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
|
|
{# Container start script, launched once rsyslogd has reached the running state. #}
{# startsecs=0: the script does not have to stay up for any minimum time to count as started; the DHCP relay programs wait for it to reach the exited state. #}
[program:start]
|
|
command=/usr/bin/start.sh
|
|
priority=2
|
|
autostart=false
|
|
autorestart=false
|
|
startsecs=0
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=rsyslogd:running
|
|
|
|
{# If our configuration has VLANs... #}
|
|
{% if VLAN_INTERFACE %}
|
|
{# Count how many VLANs require a DHCP relay agent... #}
|
|
{# num_relays is a dict so the loop body can update it in place; a plain set inside a for loop would not be visible after the loop ends. #}
{# NOTE(review): this counts every VLAN with a non-empty dhcp_servers list, whatever the address family; the program sections further down are only generated for VLANs with at least one IPv4 server. #}
{% set num_relays = { 'count': 0 } %}
|
|
{% for vlan_name in VLAN_INTERFACE %}
|
|
{% if VLAN and vlan_name in VLAN and VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% set _dummy = num_relays.update({'count': num_relays.count + 1}) %}
|
|
{% endif %}
|
|
{% endfor %}
|
|
{# If one or more of the VLANs require a DHCP relay agent... #}
|
|
{% if num_relays.count > 0 %}
|
|
{# Supervisor group of the relay agents: the loop builds a comma-separated programs= value with one isc-dhcp-relay-<vlan> entry per VLAN that has dhcp_servers. #}
{# add_preceding_comma.flag is False for the first entry and True afterwards, so a comma is only written between entries. #}
{# NOTE(review): a VLAN whose dhcp_servers are all non-IPv4 is listed here, but no matching program section is generated for it further down - confirm whether that input can occur. #}
[group:isc-dhcp-relay]
|
|
programs=
|
|
{%- set add_preceding_comma = { 'flag': False } %}
|
|
{% for vlan_name in VLAN_INTERFACE %}
|
|
{% if VLAN and vlan_name in VLAN and VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% if add_preceding_comma.flag %},{% endif %}
|
|
{% set _dummy = add_preceding_comma.update({'flag': True}) %}
|
|
isc-dhcp-relay-{{ vlan_name }}
|
|
{%- endif %}
|
|
{% endfor %}
|
|
|
|
|
|
{# Create a program entry for each DHCP relay agent instance #}
{# relay_for_ipv4.flag is raised when the current VLAN has at least one IPv4 DHCP server and is reset before the program section is written, so it does not carry over to the next VLAN. #}
{# The command is assembled on one logical line: the VLAN itself as -id, an optional " -U Loopback0 -dt" when DEVICE_METADATA subtype is DualToR, every other IPv4 VLAN/INTERFACE/PORTCHANNEL interface as -iu, then the IPv4 DHCP server addresses. #}
{# The doubled percent signs in the command are supervisord escapes for a literal percent sign. #}
{# Each relay program is started only after [program:start] has exited (dependent_startup_wait_for=start:exited). #}
|
|
{% set relay_for_ipv4 = { 'flag': False } %}
|
|
{% for vlan_name in VLAN_INTERFACE %}
|
|
{% if VLAN and vlan_name in VLAN and VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% for dhcp_server in VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% if dhcp_server | ipv4 %}
|
|
{% set _dummy = relay_for_ipv4.update({'flag': True}) %}
|
|
{% endif %}
|
|
{% endfor %}
|
|
{% if relay_for_ipv4.flag %}
|
|
{% set _dummy = relay_for_ipv4.update({'flag': False}) %}
|
|
[program:isc-dhcp-relay-{{ vlan_name }}]
|
|
{# We treat this VLAN as a downstream interface (-id), as we only want to listen for requests #}
|
|
command=/usr/sbin/dhcrelay -d -m discard -a %%h:%%p %%P --name-alias-map-file /tmp/port-name-alias-map.txt -id {{ vlan_name }}
|
|
{#- Dual ToR Option #}
|
|
{% if 'subtype' in DEVICE_METADATA['localhost'] and DEVICE_METADATA['localhost']['subtype'] == 'DualToR' %} -U Loopback0 -dt{% endif -%}
|
|
{#- We treat all other interfaces as upstream interfaces (-iu), as we only want to listen for replies #}
|
|
{% for (name, prefix) in VLAN_INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 and name != vlan_name %} -iu {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% for (name, prefix) in INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 %} -iu {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% for (name, prefix) in PORTCHANNEL_INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 %} -iu {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% for dhcp_server in VLAN[vlan_name]['dhcp_servers'] %}
|
|
{%- if dhcp_server | ipv4 %} {{ dhcp_server }}{% endif -%}
|
|
{% endfor %}
|
|
|
|
priority=3
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=start:exited
|
|
|
|
{% endif %}
|
|
{% endif %}
|
|
{% endfor %}
|
|
|
|
{# Supervisor group of the DHCP monitors: the loop builds a comma-separated programs= value with one dhcpmon-<vlan> entry per VLAN that has dhcp_servers. #}
{# add_preceding_comma.flag is False for the first entry and True afterwards, so a comma is only written between entries. #}
{# NOTE(review): a VLAN whose dhcp_servers are all non-IPv4 is listed here, but no matching program section is generated for it further down - confirm whether that input can occur. #}
[group:dhcpmon]
|
|
programs=
|
|
{%- set add_preceding_comma = { 'flag': False } %}
|
|
{% for vlan_name in VLAN_INTERFACE %}
|
|
{% if VLAN and vlan_name in VLAN and VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% if add_preceding_comma.flag %},{% endif %}
|
|
{% set _dummy = add_preceding_comma.update({'flag': True}) %}
|
|
dhcpmon-{{ vlan_name }}
|
|
{%- endif %}
|
|
{% endfor %}
|
|
|
|
|
|
{# Create a program entry for each DHCP MONitor instance #}
{# relay_for_ipv4.flag is raised when the current VLAN has at least one IPv4 DHCP server and is reset before the program section is written, so it does not carry over to the next VLAN. #}
{# The command is assembled on one logical line: the VLAN itself as -id, every other IPv4 VLAN/INTERFACE/PORTCHANNEL interface as -iu, and, when MGMT_INTERFACE is defined, each IPv4 management interface as -im. #}
{# Each monitor is started only after the relay program of the same VLAN is running (dependent_startup_wait_for). #}
|
|
{% set relay_for_ipv4 = { 'flag': False } %}
|
|
{% for vlan_name in VLAN_INTERFACE %}
|
|
{% if VLAN and vlan_name in VLAN and VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% for dhcp_server in VLAN[vlan_name]['dhcp_servers'] %}
|
|
{% if dhcp_server | ipv4 %}
|
|
{% set _dummy = relay_for_ipv4.update({'flag': True}) %}
|
|
{% endif %}
|
|
{% endfor %}
|
|
{% if relay_for_ipv4.flag %}
|
|
{% set _dummy = relay_for_ipv4.update({'flag': False}) %}
|
|
[program:dhcpmon-{{ vlan_name }}]
|
|
{# We treat this VLAN as a downstream interface (-id), as we only want to listen for requests #}
|
|
command=/usr/sbin/dhcpmon -id {{ vlan_name }}
|
|
{#- We treat all other interfaces as upstream interfaces (-iu), as we only want to listen for replies #}
|
|
{% for (name, prefix) in VLAN_INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 and name != vlan_name %} -iu {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% for (name, prefix) in INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 %} -iu {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% for (name, prefix) in PORTCHANNEL_INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 %} -iu {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% if MGMT_INTERFACE %}
|
|
{% for (name, prefix) in MGMT_INTERFACE|pfx_filter %}
|
|
{% if prefix | ipv4 %} -im {{ name }}{% endif -%}
|
|
{% endfor %}
|
|
{% endif %}
|
|
|
|
priority=4
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=isc-dhcp-relay-{{ vlan_name }}:running
|
|
|
|
{% endif %}
|
|
{% endif %}
|
|
{% endfor %}
|
|
|
|
{# Closes the "num_relays.count > 0" check #}
{% endif %}
|
|
{# Closes the "VLAN_INTERFACE" check #}
{% endif %}
|