From 963bd7fdc4fe74af8da7076f41f8740e93e38ce8 Mon Sep 17 00:00:00 2001 From: judyjoseph <53951155+judyjoseph@users.noreply.github.com> Date: Fri, 23 Oct 2020 00:41:16 -0700 Subject: [PATCH] [docker-teamd]: Add teamd as a depedent service to swss (#5628) **- Why I did it** On teamd docker restart, the swss and syncd dockers need to be restarted as there are dependent resources present. **- How I did it** Add teamd as a dependent service for swss. Updated the docker-wait script to handle the service and its dependent services separately. Handle the case of warm-restart for the dependent service. **- How to verify it** Verified the following scenarios with the following testbed: VM1 ----------------------------[DUT 6100] -----------------------VM2, ping traffic continuous between VMs 1. Stop teamd docker alone > swss, syncd dockers seen going away > The LAG reference count error messages are seen for a while till swss docker stops. > Dockers back up. 2. Enable WR mode for teamd. Stop teamd docker alone > swss, syncd dockers not removed. > The LAG reference count error messages not seen > Repeated stop teamd docker test - same result, no effect on swss/syncd. 3. Stop swss docker. > swss, teamd, syncd go off - dockers come back correctly, interfaces up 4. Enable WR mode for swss. Stop swss docker > swss goes off without affecting syncd/teamd dockers. 5. Config reload > no reference counter error seen, dockers come back correctly, with interfaces up 6. Warm reboot, observations below > swss docker goes off first > teamd + syncd go off at the end of the WR process. > dockers come back up fine. > ping traffic between VMs was NOT HIT 7. Fast reboot, observations below > teamd goes off first ( **confirmed swss doesn't exit here** ) > swss goes off next > syncd goes away at the end of the FR process > dockers come back up fine. > there is a traffic HIT as per fast-reboot 8. 
Verified in multi-asic platform, the tests above other than WR/FB scenarios --- files/image_config/misc/docker-wait-any | 86 ++++++++++++++----- files/scripts/swss.sh | 27 +++++- .../sonic_py_common/device_info.py | 37 +++++++- 3 files changed, 126 insertions(+), 24 deletions(-) diff --git a/files/image_config/misc/docker-wait-any b/files/image_config/misc/docker-wait-any index 3988a9fbdf..6c764fb786 100755 --- a/files/image_config/misc/docker-wait-any +++ b/files/image_config/misc/docker-wait-any @@ -3,50 +3,96 @@ """ docker-wait-any This script takes one or more Docker container names as arguments, - and it will block indefinitely while all of the specified containers - are running. If any of the specified containers stop, the script will + [-s] argument is for the service which invokes this script + [-d] argument is to list the dependent services for the above service. + It will block indefinitely while all of the specified containers + are running.If any of the specified containers stop, the script will exit. + This script was created because the 'docker wait' command is lacking this functionality. It will block until ALL specified containers have stopped running. Here, we spawn multiple threads and wait on one container per thread. If any of the threads exit, the entire - application will exit. - NOTE: This script is written against docker-py version 1.6.0. Newer - versions of docker-py have a different API. -""" + application will exit, unless we are in a scenario where the following + conditions are met. 
+ (i) the container is a dependent service + (ii) warm restart is enabled at system level or for that container OR + fast reboot is enabled at system level + In this scenario, the g_thread_exit_event won't be propagated to the parent, + instead the thread will continue to do docker_client.wait again. This helps + cases where we need the dependent container to be warm-restarted without + affecting other services (e.g. warm restart of teamd service) + NOTE: This script is written against docker Python package 4.1.0. Newer + versions of docker may have a different API. +""" +import argparse import sys import threading from docker import Client +import time + +from docker import APIClient +from sonic_py_common import logger, device_info + +SYSLOG_IDENTIFIER = 'docker-wait-any' + +# Global logger instance +log = logger.Logger(SYSLOG_IDENTIFIER) # Instantiate a global event to share among our threads g_thread_exit_event = threading.Event() - - -def usage(): - print("Usage: {} [ ...]".format(sys.argv[0])) - sys.exit(1) - +g_service = [] +g_dep_services = [] def wait_for_container(docker_client, container_name): - docker_client.wait(container_name) + while True: + while docker_client.inspect_container(container_name)['State']['Status'] != "running": + time.sleep(1) - print("No longer waiting on container '{}'".format(container_name)) + docker_client.wait(container_name) - # Signal the main thread to exit - g_thread_exit_event.set() + log.log_info("No longer waiting on container '{}'".format(container_name)) + # If this is a dependent service and warm restart is enabled for the system/container, + # OR if the system is going through a fast-reboot, DON'T signal main thread to exit + if (container_name in g_dep_services and + (device_info.is_warm_restart_enabled(container_name) or device_info.is_fast_reboot_enabled())): + continue + + # Signal the main thread to exit + g_thread_exit_event.set() def main(): thread_list = [] docker_client = 
Client(base_url='unix://var/run/docker.sock') - # Ensure we were passed at least one argument - if len(sys.argv) < 2: - usage() + parser = argparse.ArgumentParser(description='Wait for dependent docker services', + version='1.0.0', + formatter_class=argparse.RawTextHelpFormatter, + epilog=""" +Examples: + docker-wait-any -s swss -d syncd teamd +""") - container_names = sys.argv[1:] + parser.add_argument('-s','--service', nargs='+', default=None, help='name of the service') + parser.add_argument('-d','--dependent', nargs='*', default=None, help='other dependent services') + args = parser.parse_args() + + global g_service + global g_dep_services + + if args.service is not None: + g_service = args.service + if args.dependent is not None: + g_dep_services = args.dependent + + container_names = g_service + g_dep_services + + # If the service and dependents passed as args is empty, then exit + if container_names == []: + sys.exit(0) for container_name in container_names: t = threading.Thread(target=wait_for_container, args=[docker_client, container_name]) diff --git a/files/scripts/swss.sh b/files/scripts/swss.sh index 74828c740f..9053243872 100755 --- a/files/scripts/swss.sh +++ b/files/scripts/swss.sh @@ -161,7 +161,20 @@ wait() { else RUNNING=$(docker inspect -f '{{.State.Running}}' ${PEER}) fi - if [[ x"$RUNNING" == x"true" ]]; then + ALL_DEPS_RUNNING=true + for dep in ${MULTI_INST_DEPENDENT}; do + if [[ ! -z $DEV ]]; then + DEP_RUNNING=$(docker inspect -f '{{.State.Running}}' ${dep}$DEV) + else + DEP_RUNNING=$(docker inspect -f '{{.State.Running}}' ${dep}) + fi + if [[ x"$DEP_RUNNING" != x"true" ]]; then + ALL_DEPS_RUNNING=false + break + fi + done + + if [[ x"$RUNNING" == x"true" && x"$ALL_DEPS_RUNNING" == x"true" ]]; then break else sleep 1 @@ -170,10 +183,18 @@ wait() { # NOTE: This assumes Docker containers share the same names as their # corresponding services + for dep in ${MULTI_INST_DEPENDENT}; do + if [[ ! 
-z $DEV ]]; then + ALL_DEPS="$ALL_DEPS ${dep}$DEV" + else + ALL_DEPS="$ALL_DEPS ${dep}" + fi + done + if [[ ! -z $DEV ]]; then - /usr/bin/docker-wait-any ${SERVICE}$DEV ${PEER}$DEV + /usr/bin/docker-wait-any -s ${SERVICE}$DEV -d ${PEER}$DEV ${ALL_DEPS} else - /usr/bin/docker-wait-any ${SERVICE} ${PEER} + /usr/bin/docker-wait-any -s ${SERVICE} -d ${PEER} ${ALL_DEPS} fi } diff --git a/src/sonic-py-common/sonic_py_common/device_info.py b/src/sonic-py-common/sonic_py_common/device_info.py index 1393e19589..7f75ea6cdc 100644 --- a/src/sonic-py-common/sonic_py_common/device_info.py +++ b/src/sonic-py-common/sonic_py_common/device_info.py @@ -9,7 +9,7 @@ import subprocess from natsort import natsorted # TODD: Replace with swsscommon -from swsssdk import ConfigDBConnector, SonicDBConfig +from swsssdk import ConfigDBConnector, SonicDBConfig, SonicV2Connector USR_SHARE_SONIC_PATH = "/usr/share/sonic" HOST_DEVICE_PATH = USR_SHARE_SONIC_PATH + "/device" @@ -363,3 +363,38 @@ def get_system_routing_stack(): raise OSError("Cannot detect routing stack") return result + +# Check if System warm reboot or Container warm restart is enabled. +def is_warm_restart_enabled(container_name): + state_db = SonicV2Connector(host='127.0.0.1') + state_db.connect(state_db.STATE_DB, False) + + TABLE_NAME_SEPARATOR = '|' + prefix = 'WARM_RESTART_ENABLE_TABLE' + TABLE_NAME_SEPARATOR + + # Get the system warm reboot enable state + _hash = '{}{}'.format(prefix, 'system') + wr_system_state = state_db.get(state_db.STATE_DB, _hash, "enable") + wr_enable_state = True if wr_system_state == "true" else False + + # Get the container warm reboot enable state + _hash = '{}{}'.format(prefix, container_name) + wr_container_state = state_db.get(state_db.STATE_DB, _hash, "enable") + wr_enable_state |= True if wr_container_state == "true" else False + + state_db.close(state_db.STATE_DB) + return wr_enable_state + +# Check if System fast reboot is enabled. 
+def is_fast_reboot_enabled():
+    fb_system_state = 0  # falsy default, kept when the command fails or prints nothing
+    cmd = 'sonic-db-cli STATE_DB get "FAST_REBOOT|system"'
+    proc = subprocess.Popen(cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
+    (stdout, stderr) = proc.communicate()  # stderr is not piped, so it is None here
+    # universal_newlines=True yields text output, so rstrip('\n') below also works on Python 3 (bytes otherwise).
+    if proc.returncode != 0:
+        log.log_error("Error running command '{}'".format(cmd))  # NOTE(review): confirm `log` is defined in device_info.py
+    elif stdout:
+        fb_system_state = stdout.rstrip('\n')
+    # Result is 0 or the command output with trailing newlines stripped; callers should only test its truthiness.
+    return fb_system_state