[services] Restart SwSS service upon unexpected critical process exit (#2845)
* [service] Restart SwSS Docker container if orchagent exits unexpectedly
* Configure systemd to stop restarting swss if it attempts to restart more than 3 times in 20 minutes
* Move supervisor-proc-exit-listener script
* [docker-dhcp-relay] Enhance wait_for_intf.sh.j2 to utilize STATEDB
* Ensure dependent services stop/start/restart with SwSS
* Change 'StartLimitInterval' to 'StartLimitIntervalSec', as Stretch installs systemd 232 (>= v230)
* Also update journald.conf options
* Remove 'PartOf' option from unit files
* Add '$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)' to new shared docker-orchagent makefile
* Make supervisor-proc-exit-listener script read from 'critical_processes' file inside container
* Update critical_processes file for swss container
This commit is contained in:
parent
2736da97c7
commit
6eca27e564
@ -1,42 +1,40 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
STATE_DB_IDX="6"
|
||||
|
||||
PORT_TABLE_PREFIX="PORT_TABLE"
|
||||
VLAN_TABLE_PREFIX="VLAN_TABLE"
|
||||
LAG_TABLE_PREFIX="LAG_TABLE"
|
||||
|
||||
function wait_until_iface_ready
|
||||
{
|
||||
IFACE=$1
|
||||
TABLE_PREFIX=$1
|
||||
IFACE=$2
|
||||
|
||||
echo "Waiting until interface $IFACE is up..."
|
||||
echo "Waiting until interface $IFACE is ready..."
|
||||
|
||||
# Wait for the interface to come up (i.e., 'ip link show' returns 0)
|
||||
until ip link show dev $IFACE up > /dev/null 2>&1; do
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "Interface $IFACE is up"
|
||||
|
||||
echo "Waiting until interface $IFACE has an IPv4 address..."
|
||||
|
||||
# Wait until the interface gets assigned an IPv4 address
|
||||
# Wait for the interface to come up
|
||||
# (i.e., interface is present in STATE_DB and state is "ok")
|
||||
while true; do
|
||||
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)
|
||||
|
||||
if [ -n "$IP" ]; then
|
||||
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
|
||||
if [ x"$RESULT" == x"ok" ]; then
|
||||
break
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "Interface $IFACE is configured with IP $IP"
|
||||
echo "Interface ${IFACE} is ready!"
|
||||
}
|
||||
|
||||
|
||||
# Wait for all interfaces to come up and have IPv4 addresses assigned
|
||||
# Wait for all interfaces to be up and ready
|
||||
{% for (name, prefix) in INTERFACE %}
|
||||
wait_until_iface_ready {{ name }}
|
||||
wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }}
|
||||
{% endfor %}
|
||||
{% for (name, prefix) in VLAN_INTERFACE %}
|
||||
wait_until_iface_ready {{ name }}
|
||||
wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }}
|
||||
{% endfor %}
|
||||
{% for (name, prefix) in PORTCHANNEL_INTERFACE %}
|
||||
wait_until_iface_ready {{ name }}
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }}
|
||||
{% endfor %}
|
||||
|
@ -41,6 +41,8 @@ COPY ["files/arp_update", "/usr/bin"]
|
||||
COPY ["enable_counters.py", "/usr/bin"]
|
||||
COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
|
||||
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
|
||||
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
|
||||
COPY ["critical_processes", "/etc/supervisor/"]
|
||||
|
||||
## Copy all Jinja2 template files into the templates folder
|
||||
COPY ["*.j2", "/usr/share/sonic/templates/"]
|
||||
|
11
dockers/docker-orchagent/critical_processes
Normal file
11
dockers/docker-orchagent/critical_processes
Normal file
@ -0,0 +1,11 @@
|
||||
orchagent
|
||||
portsyncd
|
||||
neighsyncd
|
||||
vlanmgrd
|
||||
intfmgrd
|
||||
portmgrd
|
||||
buffermgrd
|
||||
vrfmgrd
|
||||
nbrmgrd
|
||||
vxlanmgrd
|
||||
intfsyncd
|
@ -3,6 +3,12 @@ logfile_maxbytes=1MB
|
||||
logfile_backups=2
|
||||
nodaemon=true
|
||||
|
||||
[eventlistener:supervisor-proc-exit-listener]
|
||||
command=/usr/bin/supervisor-proc-exit-listener
|
||||
events=PROCESS_STATE_EXITED
|
||||
autostart=true
|
||||
autorestart=unexpected
|
||||
|
||||
[program:start.sh]
|
||||
command=/usr/bin/start.sh
|
||||
priority=1
|
||||
@ -15,7 +21,7 @@ stderr_logfile=syslog
|
||||
command=/usr/sbin/rsyslogd -n
|
||||
priority=2
|
||||
autostart=false
|
||||
autorestart=false
|
||||
autorestart=unexpected
|
||||
stdout_logfile=syslog
|
||||
stderr_logfile=syslog
|
||||
|
||||
|
@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh wait
|
||||
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target teamd.service
|
||||
WantedBy=multi-user.target swss.service teamd.service
|
||||
|
@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh wait
|
||||
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
WantedBy=multi-user.target swss.service
|
||||
|
@ -9,3 +9,6 @@ Before=ntp-config.service
|
||||
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
|
||||
ExecStart=/usr/bin/{{docker_container_name}}.sh wait
|
||||
ExecStop=/usr/bin/{{docker_container_name}}.sh stop
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target swss.service
|
||||
|
@ -9,6 +9,8 @@ Requires=nps-modules-4.9.0-8-2-amd64.service
|
||||
After=database.service updategraph.service
|
||||
After=interfaces-config.service
|
||||
Before=ntp-config.service
|
||||
StartLimitIntervalSec=1200
|
||||
StartLimitBurst=3
|
||||
|
||||
[Service]
|
||||
User=root
|
||||
@ -16,6 +18,8 @@ Environment=sonic_asic_platform={{ sonic_asic_platform }}
|
||||
ExecStartPre=/usr/local/bin/swss.sh start
|
||||
ExecStart=/usr/local/bin/swss.sh wait
|
||||
ExecStop=/usr/local/bin/swss.sh stop
|
||||
Restart=always
|
||||
RestartSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=TEAMD container
|
||||
Requires=updategraph.service
|
||||
Requires=updategraph.service swss.service
|
||||
After=updategraph.service swss.service
|
||||
Before=ntp-config.service
|
||||
|
||||
@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh wait
|
||||
ExecStop=/usr/bin/{{docker_container_name}}.sh stop
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
WantedBy=multi-user.target swss.service
|
||||
|
@ -13,7 +13,7 @@
|
||||
#Seal=yes
|
||||
#SplitMode=uid
|
||||
#SyncIntervalSec=5m
|
||||
#RateLimitInterval=30s
|
||||
#RateLimitIntervalSec=30s
|
||||
#RateLimitBurst=1000
|
||||
SystemMaxUse=50M
|
||||
#SystemKeepFree=
|
||||
|
45
files/scripts/supervisor-proc-exit-listener
Executable file
45
files/scripts/supervisor-proc-exit-listener
Executable file
@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import syslog
|
||||
|
||||
from supervisor import childutils
|
||||
|
||||
# Contents of file should be the names of critical processes (as defined in
|
||||
# supervisord.conf file), one per line
|
||||
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
|
||||
|
||||
def main():
    """Run the event-listener loop and stop the parent when a critical process dies.

    Reads the critical process names from CRITICAL_PROCESSES_FILE (one name
    per line), then loops forever: signals READY, reads one event (a header
    line followed by ``len`` bytes of payload) from stdin, and acknowledges
    it. When a PROCESS_STATE_EXITED event reports ``expected == 0`` for a
    process whose name is in the critical list, a message is written to
    syslog and SIGTERM is sent to the parent process (presumably supervisord,
    since this script is configured as one of its event listeners).

    The function never returns on its own.
    """
    # Read the list of critical processes from a file. Surrounding whitespace
    # is stripped and blank lines are skipped, so a trailing empty line or a
    # stray '\r' cannot produce an entry that never matches a process name.
    with open(CRITICAL_PROCESSES_FILE, 'r') as f:
        critical_processes = [name for name in (line.strip() for line in f) if name]

    while True:
        # Transition from ACKNOWLEDGED to READY
        childutils.listener.ready()

        # Each event is a header line followed by headers['len'] bytes of payload
        line = sys.stdin.readline()
        headers = childutils.get_headers(line)
        payload = sys.stdin.read(int(headers['len']))

        # Transition from READY to ACKNOWLEDGED
        childutils.listener.ok()

        # We only care about PROCESS_STATE_EXITED events
        if headers['eventname'] == 'PROCESS_STATE_EXITED':
            # NOTE(review): a newline is appended before parsing; presumably
            # eventdata() expects a header line terminated by '\n' -- confirm
            # against supervisor.childutils.
            payload_headers, payload_data = childutils.eventdata(payload + '\n')

            expected = int(payload_headers['expected'])
            processname = payload_headers['processname']

            # If a critical process exited unexpectedly, terminate supervisor
            if expected == 0 and processname in critical_processes:
                MSG_FORMAT_STR = "Process {} exited unexpectedly. Terminating supervisor..."
                msg = MSG_FORMAT_STR.format(processname)
                syslog.syslog(syslog.LOG_INFO, msg)
                # The parent of an event listener is the process that spawned it
                os.kill(os.getppid(), signal.SIGTERM)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -6,7 +6,7 @@ DOCKER_DHCP_RELAY_DBG = $(DOCKER_DHCP_RELAY_STEM)-$(DBG_IMAGE_MARK).gz
|
||||
|
||||
$(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/$(DOCKER_DHCP_RELAY_STEM)
|
||||
|
||||
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY)
|
||||
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(REDIS_TOOLS)
|
||||
$(DOCKER_DHCP_RELAY)_DBG_DEPENDS = $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_DEPENDS)
|
||||
$(DOCKER_DHCP_RELAY)_DBG_IMAGE_PACKAGES = $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_IMAGE_PACKAGES)
|
||||
|
||||
|
@ -34,4 +34,4 @@ $(DOCKER_ORCHAGENT)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
|
||||
$(DOCKER_ORCHAGENT)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
|
||||
|
||||
$(DOCKER_ORCHAGENT)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
|
||||
$(DOCKER_ORCHAGENT)_FILES += $(ARP_UPDATE_SCRIPT)
|
||||
$(DOCKER_ORCHAGENT)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
|
||||
|
@ -11,9 +11,13 @@ $(BUFFERS_CONFIG_TEMPLATE)_PATH = files/build_templates
|
||||
QOS_CONFIG_TEMPLATE = qos_config.j2
|
||||
$(QOS_CONFIG_TEMPLATE)_PATH = files/build_templates
|
||||
|
||||
SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
|
||||
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts
|
||||
|
||||
SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
|
||||
$(ARP_UPDATE_SCRIPT) \
|
||||
$(BUFFERS_CONFIG_TEMPLATE) \
|
||||
$(QOS_CONFIG_TEMPLATE)
|
||||
$(QOS_CONFIG_TEMPLATE) \
|
||||
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
|
||||
|
||||
|
||||
|
@ -1,43 +1,41 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
STATE_DB_IDX="6"
|
||||
|
||||
PORT_TABLE_PREFIX="PORT_TABLE"
|
||||
VLAN_TABLE_PREFIX="VLAN_TABLE"
|
||||
LAG_TABLE_PREFIX="LAG_TABLE"
|
||||
|
||||
function wait_until_iface_ready
|
||||
{
|
||||
IFACE=$1
|
||||
TABLE_PREFIX=$1
|
||||
IFACE=$2
|
||||
|
||||
echo "Waiting until interface $IFACE is up..."
|
||||
echo "Waiting until interface $IFACE is ready..."
|
||||
|
||||
# Wait for the interface to come up (i.e., 'ip link show' returns 0)
|
||||
until ip link show dev $IFACE up > /dev/null 2>&1; do
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "Interface $IFACE is up"
|
||||
|
||||
echo "Waiting until interface $IFACE has an IPv4 address..."
|
||||
|
||||
# Wait until the interface gets assigned an IPv4 address
|
||||
# Wait for the interface to come up
|
||||
# (i.e., interface is present in STATE_DB and state is "ok")
|
||||
while true; do
|
||||
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)
|
||||
|
||||
if [ -n "$IP" ]; then
|
||||
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
|
||||
if [ x"$RESULT" == x"ok" ]; then
|
||||
break
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "Interface $IFACE is configured with IP $IP"
|
||||
echo "Interface ${IFACE} is ready!"
|
||||
}
|
||||
|
||||
|
||||
# Wait for all interfaces to come up and have IPv4 addresses assigned
|
||||
wait_until_iface_ready Vlan1000
|
||||
wait_until_iface_ready PortChannel01
|
||||
wait_until_iface_ready PortChannel01
|
||||
wait_until_iface_ready PortChannel02
|
||||
wait_until_iface_ready PortChannel02
|
||||
wait_until_iface_ready PortChannel03
|
||||
wait_until_iface_ready PortChannel03
|
||||
wait_until_iface_ready PortChannel04
|
||||
wait_until_iface_ready PortChannel04
|
||||
# Wait for all interfaces to be up and ready
|
||||
wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
|
||||
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user