From 71225ea4cccd3fe793a6d3536b2e607e5e31268d Mon Sep 17 00:00:00 2001 From: yozhao101 <56170650+yozhao101@users.noreply.github.com> Date: Fri, 7 Feb 2020 12:34:07 -0800 Subject: [PATCH] [Service] Enable/disable container auto-restart based on configuration. (#4073) --- .../docker-dhcp-relay.supervisord.conf.j2 | 2 +- dockers/docker-lldp-sv2/supervisord.conf | 2 +- dockers/docker-orchagent/supervisord.conf | 2 +- .../docker-pmon.supervisord.conf.j2 | 2 +- .../docker-router-advertiser.supervisord.conf | 2 +- dockers/docker-sflow/supervisord.conf | 2 +- dockers/docker-snmp-sv2/supervisord.conf | 2 +- .../docker-sonic-telemetry/supervisord.conf | 2 +- dockers/docker-teamd/supervisord.conf | 2 +- files/scripts/supervisor-proc-exit-listener | 42 +++++++++++++++++-- .../docker-syncd-bfn/supervisord.conf | 6 +++ .../docker-syncd-brcm/supervisord.conf | 2 +- .../cavium/docker-syncd-cavm/supervisord.conf | 2 +- .../docker-syncd-centec/supervisord.conf | 2 +- .../docker-syncd-mrvl/supervisord.conf | 6 +++ .../docker-syncd-mrvl/supervisord.conf | 6 +++ .../docker-syncd-mrvl/supervisord.conf | 2 +- .../docker-syncd-mlnx/supervisord.conf | 2 +- .../docker-syncd-nephos/supervisord.conf | 2 +- .../docker-dhcp-relay.supervisord.conf | 2 +- 20 files changed, 72 insertions(+), 20 deletions(-) diff --git a/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 b/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 index 94fdbfdaff..d524004807 100644 --- a/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 +++ b/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name dhcp_relay events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-lldp-sv2/supervisord.conf b/dockers/docker-lldp-sv2/supervisord.conf index 3f3f5beabc..73ff52f442 100644 --- a/dockers/docker-lldp-sv2/supervisord.conf +++ b/dockers/docker-lldp-sv2/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name lldp events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-orchagent/supervisord.conf b/dockers/docker-orchagent/supervisord.conf index 9ae2776f6d..6b21d73f3c 100644 --- a/dockers/docker-orchagent/supervisord.conf +++ b/dockers/docker-orchagent/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name swss events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 b/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 index 9a2414c30d..13ae0e767a 100644 --- a/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 +++ b/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name pmon events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-router-advertiser/docker-router-advertiser.supervisord.conf b/dockers/docker-router-advertiser/docker-router-advertiser.supervisord.conf index 4ea84ab11c..bf9320acc7 100644 --- a/dockers/docker-router-advertiser/docker-router-advertiser.supervisord.conf +++ b/dockers/docker-router-advertiser/docker-router-advertiser.supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-script] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name radv events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-sflow/supervisord.conf b/dockers/docker-sflow/supervisord.conf index 50986f197d..8eb1bdc05e 100644 --- a/dockers/docker-sflow/supervisord.conf +++ b/dockers/docker-sflow/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name sflow events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-snmp-sv2/supervisord.conf b/dockers/docker-snmp-sv2/supervisord.conf index 7fd16eec5b..9922923305 100644 --- a/dockers/docker-snmp-sv2/supervisord.conf +++ b/dockers/docker-snmp-sv2/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name snmp events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/dockers/docker-sonic-telemetry/supervisord.conf b/dockers/docker-sonic-telemetry/supervisord.conf index e1346fe7db..54f4c5b234 100644 --- a/dockers/docker-sonic-telemetry/supervisord.conf +++ b/dockers/docker-sonic-telemetry/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name telemetry events=PROCESS_STATE_EXITED autostart=true autorestart=false diff --git a/dockers/docker-teamd/supervisord.conf b/dockers/docker-teamd/supervisord.conf index 43d6d2d6d1..0c3071bbfd 100644 --- a/dockers/docker-teamd/supervisord.conf +++ b/dockers/docker-teamd/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name teamd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 8d1735cd2b..cf26d53830 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -1,17 +1,34 @@ #!/usr/bin/env python +import getopt import os import signal import sys import syslog +import swsssdk + from supervisor import childutils # Contents of file should be the names of critical processes (as defined in # supervisor.conf file), one per line CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' -def main(): +# This table in databse contains the features for container and each +# feature for a row will be configured a state or number. +CONTAINER_FEATURE_TABLE_NAME = 'CONTAINER_FEATURE' + +def main(argv): + container_name = None + opts, args = getopt.getopt(argv, "c:", ["container-name="]) + for opt, arg in opts: + if opt in ("-c", "--container-name"): + container_name = arg + + if not container_name: + syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...") + sys.exit(1) + # Read the list of critical processes from a file with open(CRITICAL_PROCESSES_FILE, 'r') as f: critical_processes = [line.rstrip('\n') for line in f] @@ -35,12 +52,29 @@ def main(): processname = payload_headers['processname'] groupname = payload_headers['groupname'] - # If a critical process exited unexpectedly, terminate supervisor - if expected == 0 and processname in critical_processes or groupname in critical_processes: + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + container_features_table = config_db.get_table(CONTAINER_FEATURE_TABLE_NAME) + if not container_features_table: + syslog.syslog(syslog.LOG_ERR, "Unable to retrieve container features table from Config DB. Exiting...") + sys.exit(2) + + if not container_features_table.has_key(container_name): + syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features for container '{}'. Exiting...".format(container_name)) + sys.exit(3) + + restart_feature = container_features_table[container_name].get('auto_restart') + if not restart_feature: + syslog.syslog(syslog.LOG_ERR, "Unable to determine auto-restart feature status for container '{}'. Exiting...".format(container_name)) + sys.exit(4) + + # If auto-restart feature is enabled and a critical process exited unexpectedly, terminate supervisor + if restart_feature == 'enabled' and expected == 0 and (processname in critical_processes or groupname in critical_processes): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg) os.kill(os.getppid(), signal.SIGTERM) + if __name__ == "__main__": - main() + main(sys.argv[1:]) diff --git a/platform/barefoot/docker-syncd-bfn/supervisord.conf b/platform/barefoot/docker-syncd-bfn/supervisord.conf index 1e015fef93..1744d6ffef 100644 --- a/platform/barefoot/docker-syncd-bfn/supervisord.conf +++ b/platform/barefoot/docker-syncd-bfn/supervisord.conf @@ -3,6 +3,12 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + [program:start.sh] command=/usr/bin/start.sh priority=1 diff --git a/platform/broadcom/docker-syncd-brcm/supervisord.conf b/platform/broadcom/docker-syncd-brcm/supervisord.conf index cd6712acbf..3fa8febb85 100644 --- a/platform/broadcom/docker-syncd-brcm/supervisord.conf +++ b/platform/broadcom/docker-syncd-brcm/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/platform/cavium/docker-syncd-cavm/supervisord.conf b/platform/cavium/docker-syncd-cavm/supervisord.conf index c823ab5680..0c6285d46a 100644 --- a/platform/cavium/docker-syncd-cavm/supervisord.conf +++ b/platform/cavium/docker-syncd-cavm/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/platform/centec/docker-syncd-centec/supervisord.conf b/platform/centec/docker-syncd-centec/supervisord.conf index c823ab5680..0c6285d46a 100644 --- a/platform/centec/docker-syncd-centec/supervisord.conf +++ b/platform/centec/docker-syncd-centec/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/platform/marvell-arm64/docker-syncd-mrvl/supervisord.conf b/platform/marvell-arm64/docker-syncd-mrvl/supervisord.conf index 1af5d70a1d..b11e045fac 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/supervisord.conf +++ b/platform/marvell-arm64/docker-syncd-mrvl/supervisord.conf @@ -3,6 +3,12 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + [program:start.sh] command=/usr/bin/start.sh priority=1 diff --git a/platform/marvell-armhf/docker-syncd-mrvl/supervisord.conf b/platform/marvell-armhf/docker-syncd-mrvl/supervisord.conf index 1af5d70a1d..b11e045fac 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/supervisord.conf +++ b/platform/marvell-armhf/docker-syncd-mrvl/supervisord.conf @@ -3,6 +3,12 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + [program:start.sh] command=/usr/bin/start.sh priority=1 diff --git a/platform/marvell/docker-syncd-mrvl/supervisord.conf b/platform/marvell/docker-syncd-mrvl/supervisord.conf index aea4d45b9a..43de2426f9 100644 --- a/platform/marvell/docker-syncd-mrvl/supervisord.conf +++ b/platform/marvell/docker-syncd-mrvl/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/platform/mellanox/docker-syncd-mlnx/supervisord.conf b/platform/mellanox/docker-syncd-mlnx/supervisord.conf index c823ab5680..0c6285d46a 100644 --- a/platform/mellanox/docker-syncd-mlnx/supervisord.conf +++ b/platform/mellanox/docker-syncd-mlnx/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/platform/nephos/docker-syncd-nephos/supervisord.conf b/platform/nephos/docker-syncd-nephos/supervisord.conf index c823ab5680..0c6285d46a 100644 --- a/platform/nephos/docker-syncd-nephos/supervisord.conf +++ b/platform/nephos/docker-syncd-nephos/supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name syncd events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected diff --git a/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf b/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf index a29982a646..fde1d6c771 100644 --- a/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf +++ b/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf @@ -4,7 +4,7 @@ logfile_backups=2 nodaemon=true [eventlistener:supervisor-proc-exit-listener] -command=/usr/bin/supervisor-proc-exit-listener +command=/usr/bin/supervisor-proc-exit-listener --container-name dhcp_relay events=PROCESS_STATE_EXITED autostart=true autorestart=unexpected