Revert "[201803] [services] Restart SwSS service upon unexpected critical process exit (#2546)"

This reverts commit 2a8af2705e.
This commit is contained in:
Guohan Lu 2019-04-06 22:21:30 +00:00
parent 968a0dfbd0
commit 9299a249d3
19 changed files with 60 additions and 127 deletions

View File

@ -1,40 +1,42 @@
#!/usr/bin/env bash #!/usr/bin/env bash
STATE_DB_IDX="6"
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"
function wait_until_iface_ready function wait_until_iface_ready
{ {
TABLE_PREFIX=$1 IFACE=$1
IFACE=$2
echo "Waiting until interface $IFACE is ready..." echo "Waiting until interface $IFACE is up..."
# Wait for the interface to come up # Wait for the interface to come up (i.e., 'ip link show' returns 0)
# (i.e., interface is present in STATE_DB and state is "ok") until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
echo "Interface $IFACE is up"
echo "Waiting until interface $IFACE has an IPv4 address..."
# Wait until the interface gets assigned an IPv4 address
while true; do while true; do
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null) IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)
if [ x"$RESULT" == x"ok" ]; then
if [ -n "$IP" ]; then
break break
fi fi
sleep 1 sleep 1
done done
echo "Interface ${IFACE} is ready!" echo "Interface $IFACE is configured with IP $IP"
} }
# Wait for all interfaces to be up and ready # Wait for all interfaces to come up and have IPv4 addresses assigned
{% for (name, prefix) in INTERFACE %} {% for (name, prefix) in INTERFACE %}
wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }} wait_until_iface_ready {{ name }}
{% endfor %} {% endfor %}
{% for (name, prefix) in VLAN_INTERFACE %} {% for (name, prefix) in VLAN_INTERFACE %}
wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }} wait_until_iface_ready {{ name }}
{% endfor %} {% endfor %}
{% for (name, prefix) in PORTCHANNEL_INTERFACE %} {% for (name, prefix) in PORTCHANNEL_INTERFACE %}
wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }} wait_until_iface_ready {{ name }}
{% endfor %} {% endfor %}

View File

@ -30,8 +30,6 @@ COPY ["files/arp_update", "/usr/bin"]
COPY ["enable_counters.py", "/usr/bin"] COPY ["enable_counters.py", "/usr/bin"]
COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"] COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Copy all Jinja2 template files into the templates folder ## Copy all Jinja2 template files into the templates folder
COPY ["*.j2", "/usr/share/sonic/templates/"] COPY ["*.j2", "/usr/share/sonic/templates/"]

View File

@ -1,7 +0,0 @@
orchagent
portsyncd
intfsyncd
neighsyncd
vlanmgrd
intfmgrd
buffermgrd

View File

@ -3,12 +3,6 @@ logfile_maxbytes=1MB
logfile_backups=2 logfile_backups=2
nodaemon=true nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh] [program:start.sh]
command=/usr/bin/start.sh command=/usr/bin/start.sh
priority=1 priority=1
@ -21,7 +15,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n command=/usr/sbin/rsyslogd -n
priority=2 priority=2
autostart=false autostart=false
autorestart=unexpected autorestart=false
stdout_logfile=syslog stdout_logfile=syslog
stderr_logfile=syslog stderr_logfile=syslog

View File

@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
[Install] [Install]
WantedBy=multi-user.target swss.service teamd.service WantedBy=multi-user.target teamd.service

View File

@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
[Install] [Install]
WantedBy=multi-user.target swss.service WantedBy=multi-user.target

View File

@ -8,6 +8,3 @@ Before=ntp-config.service
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
ExecStart=/usr/bin/{{docker_container_name}}.sh attach ExecStart=/usr/bin/{{docker_container_name}}.sh attach
ExecStop=/usr/bin/{{docker_container_name}}.sh stop ExecStop=/usr/bin/{{docker_container_name}}.sh stop
[Install]
WantedBy=multi-user.target swss.service

View File

@ -14,8 +14,6 @@ After=opennsl-modules-3.16.0-6-amd64.service
After=nps-modules-3.16.0-6-amd64.service After=nps-modules-3.16.0-6-amd64.service
{% endif %} {% endif %}
Before=ntp-config.service Before=ntp-config.service
StartLimitInterval=1200
StartLimitBurst=3
[Service] [Service]
User=root User=root
@ -54,8 +52,6 @@ ExecStopPost=/usr/bin/mst stop
ExecStopPost=/etc/init.d/xpnet.sh stop ExecStopPost=/etc/init.d/xpnet.sh stop
ExecStopPost=/etc/init.d/xpnet.sh start ExecStopPost=/etc/init.d/xpnet.sh start
{% endif %} {% endif %}
Restart=always
RestartSec=30
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@ -1,7 +1,7 @@
[Unit] [Unit]
Description=TEAMD container Description=TEAMD container
Requires=updategraph.service swss.service Requires=updategraph.service
After=updategraph.service swss.service After=updategraph.service
Before=ntp-config.service Before=ntp-config.service
[Service] [Service]
@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh attach
ExecStop=/usr/bin/{{docker_container_name}}.sh stop ExecStop=/usr/bin/{{docker_container_name}}.sh stop
[Install] [Install]
WantedBy=multi-user.target swss.service WantedBy=multi-user.target

View File

@ -1,45 +0,0 @@
#!/usr/bin/env python
import os
import signal
import sys
import syslog
from supervisor import childutils
# The file lists the names of the critical processes (as they are named in
# the supervisord.conf file), one name per line
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
def main():
    """Run the event-listener loop and stop the parent when a critical process dies.

    Loads the critical process names from CRITICAL_PROCESSES_FILE, then loops
    forever reading event notifications from stdin. When a
    PROCESS_STATE_EXITED event reports an unexpected exit (expected == 0) of
    one of the listed processes, a message is written to syslog and SIGTERM is
    sent to the parent process.

    The function does not return normally; it takes no arguments.
    """
    # Read the list of critical processes from a file, one name per line.
    # Surrounding whitespace is stripped and blank lines are dropped so that a
    # trailing empty line or a stray carriage return cannot create a bogus
    # entry or prevent a real process name from matching.
    with open(CRITICAL_PROCESSES_FILE, 'r') as f:
        critical_processes = set(
            name for name in (line.strip() for line in f) if name
        )

    while True:
        # Transition from ACKNOWLEDGED to READY
        childutils.listener.ready()

        # Each notification is a header line followed by a payload whose
        # length is given by the 'len' header.
        line = sys.stdin.readline()
        headers = childutils.get_headers(line)
        payload = sys.stdin.read(int(headers['len']))

        # Transition from READY to ACKNOWLEDGED
        childutils.listener.ok()

        # We only care about PROCESS_STATE_EXITED events
        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            continue

        # NOTE(review): the trailing '\n' is presumably needed because this
        # event carries no body and eventdata() expects a header/body
        # separator -- confirm against the supervisor childutils source.
        payload_headers, payload_data = childutils.eventdata(payload + '\n')

        expected = int(payload_headers['expected'])
        processname = payload_headers['processname']

        # If a critical process exited unexpectedly, terminate supervisor
        # (the parent of this listener) by sending it SIGTERM.
        if expected == 0 and processname in critical_processes:
            MSG_FORMAT_STR = "Process {} exited unexpectedly. Terminating supervisor..."
            msg = MSG_FORMAT_STR.format(processname)
            syslog.syslog(syslog.LOG_INFO, msg)
            os.kill(os.getppid(), signal.SIGTERM)
if __name__ == "__main__":
main()

View File

@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
$(DOCKER_ORCHAGENT_BRCM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel $(DOCKER_ORCHAGENT_BRCM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT)

View File

@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
$(DOCKER_ORCHAGENT_CAVM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel $(DOCKER_ORCHAGENT_CAVM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT)

View File

@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
$(DOCKER_ORCHAGENT_CENTEC)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel $(DOCKER_ORCHAGENT_CENTEC)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT)

View File

@ -15,4 +15,4 @@ $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /host/machine.conf:/host/machine.conf
$(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_MRVL)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel $(DOCKER_ORCHAGENT_MRVL)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT)

View File

@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
$(DOCKER_ORCHAGENT_MLNX)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel $(DOCKER_ORCHAGENT_MLNX)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT)

View File

@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
$(DOCKER_ORCHAGENT_NEPHOS)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel $(DOCKER_ORCHAGENT_NEPHOS)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT)

View File

@ -2,7 +2,7 @@
DOCKER_DHCP_RELAY = docker-dhcp-relay.gz DOCKER_DHCP_RELAY = docker-dhcp-relay.gz
$(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/docker-dhcp-relay $(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/docker-dhcp-relay
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT) $(REDIS_TOOLS) $(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT)
$(DOCKER_DHCP_RELAY)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE) $(DOCKER_DHCP_RELAY)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE)
SONIC_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY) SONIC_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY) SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)

View File

@ -5,11 +5,7 @@ $(ARP_UPDATE_SCRIPT)_PATH = files/scripts
CONFIGDB_LOAD_SCRIPT = configdb-load.sh CONFIGDB_LOAD_SCRIPT = configdb-load.sh
$(CONFIGDB_LOAD_SCRIPT)_PATH = files/scripts $(CONFIGDB_LOAD_SCRIPT)_PATH = files/scripts
SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts
SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \ SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
$(ARP_UPDATE_SCRIPT) \ $(ARP_UPDATE_SCRIPT)
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

View File

@ -1,41 +1,43 @@
#!/usr/bin/env bash #!/usr/bin/env bash
STATE_DB_IDX="6"
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"
function wait_until_iface_ready function wait_until_iface_ready
{ {
TABLE_PREFIX=$1 IFACE=$1
IFACE=$2
echo "Waiting until interface $IFACE is ready..." echo "Waiting until interface $IFACE is up..."
# Wait for the interface to come up # Wait for the interface to come up (i.e., 'ip link show' returns 0)
# (i.e., interface is present in STATE_DB and state is "ok") until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
echo "Interface $IFACE is up"
echo "Waiting until interface $IFACE has an IPv4 address..."
# Wait until the interface gets assigned an IPv4 address
while true; do while true; do
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null) IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)
if [ x"$RESULT" == x"ok" ]; then
if [ -n "$IP" ]; then
break break
fi fi
sleep 1 sleep 1
done done
echo "Interface ${IFACE} is ready!" echo "Interface $IFACE is configured with IP $IP"
} }
# Wait for all interfaces to be up and ready # Wait for all interfaces to come up and have IPv4 addresses assigned
wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000 wait_until_iface_ready Vlan1000
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04 wait_until_iface_ready PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02 wait_until_iface_ready PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03 wait_until_iface_ready PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03 wait_until_iface_ready PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01 wait_until_iface_ready PortChannel01
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02 wait_until_iface_ready PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04 wait_until_iface_ready PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01 wait_until_iface_ready PortChannel01