From 3ac345922bca76f80d11f1f18efa6529d6204259 Mon Sep 17 00:00:00 2001 From: yozhao101 <56170650+yozhao101@users.noreply.github.com> Date: Tue, 11 Feb 2020 14:03:02 -0800 Subject: [PATCH] [Services] Restart database service upon unexpected critical process exit. (#4138) * [database] Implement the auto-restart feature for the database container. Signed-off-by: Yong Zhao * [database] Remove the duplicate dependency in service files. Since we already have updategraph ---> config_setup ---> database, we do not need to explicitly add database.service in all other container service files. Signed-off-by: Yong Zhao * [event listener] Reorganize line 73 in the event listener script. Signed-off-by: Yong Zhao * [database] Update the file sflow.service.j2 to remove the duplicate dependency. Signed-off-by: Yong Zhao * [event listener] Add comments in the event listener. Signed-off-by: Yong Zhao * [event listener] Update the comments in line 56. Signed-off-by: Yong Zhao * [event listener] Add parentheses around the if statement in line 76 of the event listener. 
Signed-off-by: Yong Zhao --- dockers/docker-database/Dockerfile.j2 | 2 ++ dockers/docker-database/critical_processes | 1 + dockers/docker-database/supervisord.conf.j2 | 7 ++++ .../single_instance/database.service.j2 | 4 +++ files/scripts/supervisor-proc-exit-listener | 34 +++++++++++-------- rules/docker-database.mk | 1 + 6 files changed, 34 insertions(+), 15 deletions(-) create mode 100644 dockers/docker-database/critical_processes diff --git a/dockers/docker-database/Dockerfile.j2 b/dockers/docker-database/Dockerfile.j2 index acb5e013fb..8cd1816146 100644 --- a/dockers/docker-database/Dockerfile.j2 +++ b/dockers/docker-database/Dockerfile.j2 @@ -36,5 +36,7 @@ COPY ["supervisord.conf.j2", "/usr/share/sonic/templates/"] COPY ["docker-database-init.sh", "/usr/local/bin/"] COPY ["ping_pong_db_insts", "/usr/local/bin/"] COPY ["database_config.json", "/etc/default/sonic-db/"] +COPY ["files/supervisor-proc-exit-listener", "/usr/bin"] +COPY ["critical_processes", "/etc/supervisor"] ENTRYPOINT ["/usr/local/bin/docker-database-init.sh"] diff --git a/dockers/docker-database/critical_processes b/dockers/docker-database/critical_processes new file mode 100644 index 0000000000..7800f0fad3 --- /dev/null +++ b/dockers/docker-database/critical_processes @@ -0,0 +1 @@ +redis diff --git a/dockers/docker-database/supervisord.conf.j2 b/dockers/docker-database/supervisord.conf.j2 index 110619f762..442bec1438 100644 --- a/dockers/docker-database/supervisord.conf.j2 +++ b/dockers/docker-database/supervisord.conf.j2 @@ -3,6 +3,13 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name database +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + + [program:rsyslogd] command=/bin/bash -c "rm -f /var/run/rsyslogd.pid && /usr/sbin/rsyslogd -n" priority=1 diff --git a/files/build_templates/single_instance/database.service.j2 
b/files/build_templates/single_instance/database.service.j2 index 472b9d328b..fd0063195e 100644 --- a/files/build_templates/single_instance/database.service.j2 +++ b/files/build_templates/single_instance/database.service.j2 @@ -3,12 +3,16 @@ Description=Database container Requires=docker.service After=docker.service After=rc-local.service +StartLimitIntervalSec=1200 +StartLimitBurst=3 [Service] User=root ExecStartPre=/usr/bin/{{docker_container_name}}.sh start ExecStart=/usr/bin/{{docker_container_name}}.sh wait ExecStop=/usr/bin/{{docker_container_name}}.sh stop +Restart=always +RestartSec=30 [Install] WantedBy=multi-user.target diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index cf26d53830..cf154b3a5c 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -52,24 +52,28 @@ def main(argv): processname = payload_headers['processname'] groupname = payload_headers['groupname'] - config_db = swsssdk.ConfigDBConnector() - config_db.connect() - container_features_table = config_db.get_table(CONTAINER_FEATURE_TABLE_NAME) - if not container_features_table: - syslog.syslog(syslog.LOG_ERR, "Unable to retrieve container features table from Config DB. Exiting...") - sys.exit(2) + # Read the status of auto-restart feature from Config_DB. + if container_name != 'database': + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + container_features_table = config_db.get_table(CONTAINER_FEATURE_TABLE_NAME) + if not container_features_table: + syslog.syslog(syslog.LOG_ERR, "Unable to retrieve container features table from Config DB. Exiting...") + sys.exit(2) - if not container_features_table.has_key(container_name): - syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features for container '{}'. 
Exiting...".format(container_name)) - sys.exit(3) + if not container_features_table.has_key(container_name): + syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features for container '{}'. Exiting...".format(container_name)) + sys.exit(3) - restart_feature = container_features_table[container_name].get('auto_restart') - if not restart_feature: - syslog.syslog(syslog.LOG_ERR, "Unable to determine auto-restart feature status for container '{}'. Exiting...".format(container_name)) - sys.exit(4) + restart_feature = container_features_table[container_name].get('auto_restart') + if not restart_feature: + syslog.syslog(syslog.LOG_ERR, "Unable to determine auto-restart feature status for container '{}'. Exiting...".format(container_name)) + sys.exit(4) - # If auto-restart feature is enabled and a critical process exited unexpectedly, terminate supervisor - if restart_feature == 'enabled' and expected == 0 and (processname in critical_processes or groupname in critical_processes): + # If container is database or auto-restart feature is enabled and at the same time + # a critical process exited unexpectedly, terminate supervisor + if ((container_name == 'database' or restart_feature == 'enabled') and expected == 0 and + (processname in critical_processes or groupname in critical_processes)): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg) diff --git a/rules/docker-database.mk b/rules/docker-database.mk index 91fd06819a..7e372048af 100644 --- a/rules/docker-database.mk +++ b/rules/docker-database.mk @@ -28,3 +28,4 @@ $(DOCKER_DATABASE)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_DATABASE)_BASE_IMAGE_FILES += redis-cli:/usr/bin/redis-cli $(DOCKER_DATABASE)_BASE_IMAGE_FILES += monit_database:/etc/monit/conf.d +$(DOCKER_DATABASE)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)