[dockers] Update critical_processes file syntax (#4831)

**- Why I did it**
Initially, the critical_processes file contained either the name of a critical process or the name of a group.
For example, the critical_processes file in the dhcp_relay container contains a single group name
`isc-dhcp-relay`. When testing the autorestart feature of each container, we need to get all the critical
processes and test whether a container can be restarted correctly if one of its critical processes is
killed. However, it is difficult to tell whether the names in the critical_processes file are
critical process names or group names. Changing the syntax of this file separates the individual processes from the groups and also makes the distinction clear to the user.

With this change, the critical_processes file contains two different kinds of entries. One is "program:xxx", which indicates a critical process. The other is "group:xxx", which indicates a group of critical processes
managed by supervisord using the name "xxx". At the same time, I also updated the logic that
parses the critical_processes file in the supervisor-proc-event-listener script.

**- How to verify it**
We can first enable the autorestart feature of a specified container, for example `dhcp_relay`, by running the command `sudo config container feature autorestart dhcp_relay enabled` on the DUT. Then we can select a critical process from the output of the command `docker top dhcp_relay` and use the command `sudo kill -SIGKILL <pid>` to kill that critical process. The final step is to check whether the container is restarted correctly or not.
This commit is contained in:
yozhao101 2020-06-25 21:18:21 -07:00 committed by GitHub
parent 921d132a32
commit 4fa81b4f8d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 84 additions and 58 deletions

View File

@ -1 +1 @@
redis program:redis

View File

@ -1 +1 @@
isc-dhcp-relay group:isc-dhcp-relay

View File

@ -1,5 +1,5 @@
zebra program:zebra
staticd program:staticd
bgpd program:bgpd
fpmsyncd program:fpmsyncd
bgpcfgd program:bgpcfgd

View File

@ -1,2 +1,2 @@
gobgpd program:gobgpd
fpmsyncd program:fpmsyncd

View File

@ -1,4 +1,4 @@
zebra program:zebra
bgpd program:bgpd
fpmsyncd program:fpmsyncd
bgpcfgd program:bgpcfgd

View File

@ -1,3 +1,3 @@
lldpd program:lldpd
lldp-syncd program:lldp_syncd
lldpmgrd program:lldpmgrd

View File

@ -1,2 +1,2 @@
natmgrd program:natmgrd
natsyncd program:natsyncd

View File

@ -1,10 +1,10 @@
orchagent program:orchagent
portsyncd program:portsyncd
neighsyncd program:neighsyncd
vlanmgrd program:vlanmgrd
intfmgrd program:intfmgrd
portmgrd program:portmgrd
buffermgrd program:buffermgrd
vrfmgrd program:vrfmgrd
nbrmgrd program:nbrmgrd
vxlanmgrd program:vxlanmgrd

View File

@ -1,3 +1,3 @@
ledd program:ledd
xcvrd program:xcvrd
psud program:psud

View File

@ -1 +1 @@
radvd program:radvd

View File

@ -1 +1 @@
sflowmgrd program:sflowmgrd

View File

@ -1,2 +1,2 @@
snmpd program:snmpd
snmp-subagent program:snmp-subagent

View File

@ -1 +1 @@
restapi program:restapi

View File

@ -1,2 +1,2 @@
telemetry program:telemetry
dialout program:dialout

View File

@ -1,2 +1,2 @@
teammgrd program:teammgrd
teamsyncd program:teamsyncd

View File

@ -10,14 +10,42 @@ import swsssdk
from supervisor import childutils from supervisor import childutils
# Contents of file should be the names of critical processes (as defined in # Each line of this file should specify either one critical process or one
# supervisor.conf file), one per line # critical process group, (as defined in supervisord.conf file), in the
# following format:
#
# program:<process_name>
# group:<group_name>
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
# This table in database contains the features for container and each # This table in database contains the features for container and each
# feature for a row will be configured a state or number. # feature for a row will be configured a state or number.
CONTAINER_FEATURE_TABLE_NAME = 'CONTAINER_FEATURE' CONTAINER_FEATURE_TABLE_NAME = 'CONTAINER_FEATURE'
# Read the critical processes/group names from CRITICAL_PROCESSES_FILE
def get_critical_group_and_process_list():
    """Parse CRITICAL_PROCESSES_FILE into critical group and process names.

    Every non-blank line of the file must have one of the two forms

        program:<process_name>
        group:<group_name>

    Whitespace around the key and around the name is ignored. Blank
    (whitespace-only) lines are skipped, so a stray or trailing empty line
    does not terminate the listener.

    Returns:
        A tuple ``(critical_group_list, critical_process_list)`` holding the
        group names and the process names in the order they appear.

    Exits:
        Logs an error to syslog and calls ``sys.exit(5)`` when a line does
        not split into exactly two ':'-separated fields, and ``sys.exit(6)``
        when the key is neither "group" nor "program" or the name is empty.
    """
    critical_group_list = []
    critical_process_list = []

    with open(CRITICAL_PROCESSES_FILE, 'r') as critical_processes_file:
        for line in critical_processes_file:
            # Ignore blank lines instead of treating them as syntax errors.
            if not line.strip():
                continue

            line_info = line.strip(' \n').split(':')
            if len(line_info) != 2:
                syslog.syslog(syslog.LOG_ERR, "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
                sys.exit(5)

            identifier_key = line_info[0].strip()
            identifier_value = line_info[1].strip()
            if identifier_key == "group" and identifier_value:
                critical_group_list.append(identifier_value)
            elif identifier_key == "program" and identifier_value:
                critical_process_list.append(identifier_value)
            else:
                # Unknown key, or a known key with an empty name.
                syslog.syslog(syslog.LOG_ERR, "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
                sys.exit(6)

    return critical_group_list, critical_process_list
def main(argv): def main(argv):
container_name = None container_name = None
opts, args = getopt.getopt(argv, "c:", ["container-name="]) opts, args = getopt.getopt(argv, "c:", ["container-name="])
@ -29,9 +57,7 @@ def main(argv):
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...") syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
sys.exit(1) sys.exit(1)
# Read the list of critical processes from a file critical_group_list, critical_process_list = get_critical_group_and_process_list()
with open(CRITICAL_PROCESSES_FILE, 'r') as f:
critical_processes = [line.rstrip('\n') for line in f]
while True: while True:
# Transition from ACKNOWLEDGED to READY # Transition from ACKNOWLEDGED to READY
@ -73,7 +99,7 @@ def main(argv):
# If container is database or auto-restart feature is enabled and at the same time # If container is database or auto-restart feature is enabled and at the same time
# a critical process exited unexpectedly, terminate supervisor # a critical process exited unexpectedly, terminate supervisor
if ((container_name == 'database' or restart_feature == 'enabled') and expected == 0 and if ((container_name == 'database' or restart_feature == 'enabled') and expected == 0 and
(processname in critical_processes or groupname in critical_processes)): (processname in critical_process_list or groupname in critical_group_list)):
MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..."
msg = MSG_FORMAT_STR.format(payload_headers['processname']) msg = MSG_FORMAT_STR.format(payload_headers['processname'])
syslog.syslog(syslog.LOG_INFO, msg) syslog.syslog(syslog.LOG_INFO, msg)

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1,2 +1,2 @@
dsserve program:dsserve
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd

View File

@ -1,2 +1,2 @@
dsserve program:dsserve
syncd program:syncd

View File

@ -1 +1 @@
syncd program:syncd