[docker-syncd]: Restart SwSS, syncd and dependent services if a critical process in syncd container exits unexpectedly (#3534)

Add the same mechanism I developed for the SwSS service in #2845 to the syncd service. However, in order to cause the SwSS service to also exit and restart in this situation, I developed a docker-wait-any program which the SwSS service uses to wait for either the swss or syncd containers to exit.
This commit is contained in:
Joe LeVeque 2019-11-09 10:26:39 -08:00 committed by lguohan
parent 099f9b0a6a
commit 85b0de3df1
38 changed files with 164 additions and 8 deletions

View File

@ -218,6 +218,9 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
echo "hostname-config.service" | sudo tee -a $GENERATED_SERVICE_FILE
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/
# Copy miscellaneous scripts
sudo cp $IMAGE_CONFIGS/misc/docker-wait-any $FILESYSTEM_ROOT/usr/bin/
# Copy updategraph script and service file
j2 files/build_templates/updategraph.service.j2 | sudo tee $FILESYSTEM_ROOT/etc/systemd/system/updategraph.service
sudo cp $IMAGE_CONFIGS/updategraph/updategraph $FILESYSTEM_ROOT/usr/bin/

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python
"""
docker-wait-any
This script takes one or more Docker container names as arguments,
and it will block indefinitely while all of the specified containers
are running. If any of the specified containers stop, the script will
exit.
This script was created because the 'docker wait' command is lacking
this functionality. It will block until ALL specified containers have
stopped running. Here, we spawn multiple threads and wait on one
container per thread. If any of the threads exit, the entire
application will exit.
NOTE: This script is written against docker-py version 1.6.0. Newer
versions of docker-py have a different API.
"""
import sys
import threading
from docker import Client
# Instantiate a global event to share among our threads
g_thread_exit_event = threading.Event()
def usage():
print("Usage: {} <container_name> [<container_name> ...]".format(sys.argv[0]))
sys.exit(1)
def wait_for_container(docker_client, container_name):
docker_client.wait(container_name)
print("No longer waiting on container '{}'".format(container_name))
# Signal the main thread to exit
g_thread_exit_event.set()
def main():
thread_list = []
docker_client = Client(base_url='unix://var/run/docker.sock')
# Ensure we were passed at least one argument
if len(sys.argv) < 2:
usage()
container_names = sys.argv[1:]
for container_name in container_names:
t = threading.Thread(target=wait_for_container, args=[docker_client, container_name])
t.daemon = True
t.start()
thread_list.append(t)
# Wait until we receive an event signifying one of the containers has stopped
g_thread_exit_event.wait()
sys.exit(0)
if __name__ == '__main__':
main()

View File

@ -131,7 +131,22 @@ start() {
wait() {
start_peer_and_dependent_services
/usr/bin/${SERVICE}.sh wait
# Allow some time for peer container to start
# NOTE: This assumes Docker containers share the same names as their
# corresponding services
for SECS in {1..60}; do
RUNNING=$(docker inspect -f '{{.State.Running}}' ${PEER})
if [[ x"$RUNNING" == x"true" ]]; then
break
else
sleep 1
fi
done
# NOTE: This assumes Docker containers share the same names as their
# corresponding services
/usr/bin/docker-wait-any ${SERVICE} ${PEER}
}
stop() {

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_BFN_RPC = docker-syncd-bfn-rpc.gz
$(DOCKER_SYNCD_BFN_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-bfn-rpc
$(DOCKER_SYNCD_BFN_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
$(DOCKER_SYNCD_BFN_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_BFN_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -29,6 +29,8 @@ debs/{{ deb }}{{' '}}
COPY ["start.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1 @@
syncd

View File

@ -9,7 +9,7 @@ $(DOCKER_SYNCD_BRCM_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSAIMETADATA_DBG) \
$(LIBSAIREDIS_DBG)
endif
$(DOCKER_SYNCD_BRCM_RPC)_FILES += $(DSSERVE) $(BCMCMD)
$(DOCKER_SYNCD_BRCM_RPC)_FILES += $(DSSERVE) $(BCMCMD) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
$(DOCKER_SYNCD_BRCM_RPC)_LOAD_DOCKERS += $(DOCKER_SYNCD_BASE)
SONIC_DOCKER_IMAGES += $(DOCKER_SYNCD_BRCM_RPC)
SONIC_STRETCH_DOCKERS += $(DOCKER_SYNCD_BRCM_RPC)

View File

@ -26,6 +26,8 @@ COPY ["files/dsserve", "files/bcmcmd", "start.sh", "bcmsh", "/usr/bin/"]
RUN chmod +x /usr/bin/dsserve /usr/bin/bcmcmd
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1,2 @@
dsserve
syncd

View File

@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh]
command=/usr/bin/start.sh
priority=1
@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_CAVM_RPC = docker-syncd-cavm-rpc.gz
$(DOCKER_SYNCD_CAVM_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-cavm-rpc
$(DOCKER_SYNCD_CAVM_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT) $(CAVM_LIBSAI) $(XP_TOOLS) $(REDIS_TOOLS)
$(DOCKER_SYNCD_CAVM_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_CAVM_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_CAVM = docker-syncd-cavm.gz
$(DOCKER_SYNCD_CAVM)_PATH = $(PLATFORM_PATH)/docker-syncd-cavm
$(DOCKER_SYNCD_CAVM)_DEPENDS += $(SYNCD) $(CAVM_LIBSAI) $(XP_TOOLS) $(REDIS_TOOLS)
$(DOCKER_SYNCD_CAVM)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_CAVM)_DEPENDS += $(SYNCD_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -23,6 +23,8 @@ debs/{{ deb }}{{' '}}
COPY ["start.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
COPY ["profile.ini", "/etc/ssw/AS7512/"]

View File

@ -0,0 +1 @@
syncd

View File

@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh]
command=/usr/bin/start.sh
priority=1
@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_CENTEC_RPC = docker-syncd-centec-rpc.gz
$(DOCKER_SYNCD_CENTEC_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-centec-rpc
$(DOCKER_SYNCD_CENTEC_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
$(DOCKER_SYNCD_CENTEC_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_CENTEC_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_CENTEC = docker-syncd-centec.gz
$(DOCKER_SYNCD_CENTEC)_PATH = $(PLATFORM_PATH)/docker-syncd-centec
$(DOCKER_SYNCD_CENTEC)_DEPENDS += $(SYNCD)
$(DOCKER_SYNCD_CENTEC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_CENTEC)_DEPENDS += $(SYNCD_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -24,6 +24,8 @@ RUN apt-get install -f kmod
COPY ["start.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1 @@
syncd

View File

@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh]
command=/usr/bin/start.sh
priority=1
@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_MRVL_RPC = docker-syncd-mrvl-rpc.gz
$(DOCKER_SYNCD_MRVL_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mrvl-rpc
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
$(DOCKER_SYNCD_MRVL_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -28,6 +28,8 @@ debs/{{ deb }}{{' '}}
COPY ["start.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1 @@
syncd

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_MRVL_RPC = docker-syncd-mrvl-rpc.gz
$(DOCKER_SYNCD_MRVL_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mrvl-rpc
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
$(DOCKER_SYNCD_MRVL_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -28,6 +28,8 @@ debs/{{ deb }}{{' '}}
COPY ["start.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1 @@
syncd

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_MRVL_RPC = docker-syncd-mrvl-rpc.gz
$(DOCKER_SYNCD_MRVL_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mrvl-rpc
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
$(DOCKER_SYNCD_MRVL_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -23,6 +23,8 @@ debs/{{ deb }}{{' '}}
COPY ["start.sh", "syncd.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1 @@
syncd

View File

@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh]
command=/usr/bin/start.sh
priority=1
@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

View File

@ -3,6 +3,7 @@
DOCKER_SYNCD_MLNX_RPC = docker-syncd-mlnx-rpc.gz
$(DOCKER_SYNCD_MLNX_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mlnx-rpc
$(DOCKER_SYNCD_MLNX_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
$(DOCKER_SYNCD_MLNX_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
ifeq ($(INSTALL_DEBUG_TOOLS), y)
$(DOCKER_SYNCD_MLNX_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
$(LIBSWSSCOMMON_DBG) \

View File

@ -35,5 +35,7 @@ RUN apt-get clean -y && \
COPY ["start.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
ENTRYPOINT ["/usr/bin/supervisord"]

View File

@ -0,0 +1 @@
syncd

View File

@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh]
command=/usr/bin/start.sh
priority=1
@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

View File

@ -36,6 +36,8 @@ COPY ["files/dsserve", "files/npx_diag", "start.sh", "/usr/bin/"]
RUN chmod +x /usr/bin/npx_diag /usr/bin/dsserve
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]
## Clean up
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y

View File

@ -0,0 +1,2 @@
dsserve
syncd

View File

@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected
[program:start.sh]
command=/usr/bin/start.sh
priority=1
@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

View File

@ -7,6 +7,8 @@ DOCKER_SYNCD_BASE_DBG = $(DOCKER_SYNCD_BASE_STEM)-$(DBG_IMAGE_MARK).gz
$(DOCKER_SYNCD_BASE)_PATH = $(PLATFORM_PATH)/docker-syncd-$(DOCKER_SYNCD_PLATFORM_CODE)
$(DOCKER_SYNCD_BASE)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
$(DOCKER_SYNCD_BASE)_LOAD_DOCKERS += $(DOCKER_CONFIG_ENGINE_STRETCH)
$(DOCKER_SYNCD_BASE)_DBG_DEPENDS += $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_DEPENDS)