[dockers] Update critical_processes file syntax (#4831)
**- Why I did it** Initially, the critical_processes file contained either the name of a critical process or the name of a group. For example, the critical_processes file in the dhcp_relay container contains a single group name `isc-dhcp-relay`. When testing the autorestart feature of each container, we need to get all the critical processes and test whether a container can be restarted correctly if one of its critical processes is killed. However, it is difficult to tell whether the names in the critical_processes file are critical processes or group names. Changing the syntax of this file separates the individual processes from the groups and also makes the distinction clear to the user. Right now the critical_processes file contains two different kinds of entries. One is "program:xxx", which indicates a critical process. The other is "group:xxx", which indicates a group of critical processes managed by supervisord using the name "xxx". At the same time, I also updated the logic that parses the critical_processes file in the supervisor-proc-event-listener script. **- How to verify it** We can first enable the autorestart feature of a specified container, for example `dhcp_relay`, by running the command `sudo config container feature autorestart dhcp_relay enabled` on the DUT. Then we can select a critical process from the output of the command `docker top dhcp_relay` and use the command `sudo kill -SIGKILL <pid>` to kill that critical process. The final step is to check whether the container is restarted correctly or not.
This commit is contained in:
parent
921d132a32
commit
4fa81b4f8d
@ -1 +1 @@
|
||||
redis
|
||||
program:redis
|
||||
|
@ -1 +1 @@
|
||||
isc-dhcp-relay
|
||||
group:isc-dhcp-relay
|
||||
|
@ -1,5 +1,5 @@
|
||||
zebra
|
||||
staticd
|
||||
bgpd
|
||||
fpmsyncd
|
||||
bgpcfgd
|
||||
program:zebra
|
||||
program:staticd
|
||||
program:bgpd
|
||||
program:fpmsyncd
|
||||
program:bgpcfgd
|
||||
|
@ -1,2 +1,2 @@
|
||||
gobgpd
|
||||
fpmsyncd
|
||||
program:gobgpd
|
||||
program:fpmsyncd
|
||||
|
@ -1,4 +1,4 @@
|
||||
zebra
|
||||
bgpd
|
||||
fpmsyncd
|
||||
bgpcfgd
|
||||
program:zebra
|
||||
program:bgpd
|
||||
program:fpmsyncd
|
||||
program:bgpcfgd
|
||||
|
@ -1,3 +1,3 @@
|
||||
lldpd
|
||||
lldp-syncd
|
||||
lldpmgrd
|
||||
program:lldpd
|
||||
program:lldp_syncd
|
||||
program:lldpmgrd
|
||||
|
@ -1,2 +1,2 @@
|
||||
natmgrd
|
||||
natsyncd
|
||||
program:natmgrd
|
||||
program:natsyncd
|
||||
|
@ -1,10 +1,10 @@
|
||||
orchagent
|
||||
portsyncd
|
||||
neighsyncd
|
||||
vlanmgrd
|
||||
intfmgrd
|
||||
portmgrd
|
||||
buffermgrd
|
||||
vrfmgrd
|
||||
nbrmgrd
|
||||
vxlanmgrd
|
||||
program:orchagent
|
||||
program:portsyncd
|
||||
program:neighsyncd
|
||||
program:vlanmgrd
|
||||
program:intfmgrd
|
||||
program:portmgrd
|
||||
program:buffermgrd
|
||||
program:vrfmgrd
|
||||
program:nbrmgrd
|
||||
program:vxlanmgrd
|
||||
|
@ -1,3 +1,3 @@
|
||||
ledd
|
||||
xcvrd
|
||||
psud
|
||||
program:ledd
|
||||
program:xcvrd
|
||||
program:psud
|
||||
|
@ -1 +1 @@
|
||||
radvd
|
||||
program:radvd
|
||||
|
@ -1 +1 @@
|
||||
sflowmgrd
|
||||
program:sflowmgrd
|
||||
|
@ -1,2 +1,2 @@
|
||||
snmpd
|
||||
snmp-subagent
|
||||
program:snmpd
|
||||
program:snmp-subagent
|
||||
|
@ -1 +1 @@
|
||||
restapi
|
||||
program:restapi
|
||||
|
@ -1,2 +1,2 @@
|
||||
telemetry
|
||||
dialout
|
||||
program:telemetry
|
||||
program:dialout
|
||||
|
@ -1,2 +1,2 @@
|
||||
teammgrd
|
||||
teamsyncd
|
||||
program:teammgrd
|
||||
program:teamsyncd
|
||||
|
@ -10,14 +10,42 @@ import swsssdk
|
||||
|
||||
from supervisor import childutils
|
||||
|
||||
# Contents of file should be the names of critical processes (as defined in
|
||||
# supervisor.conf file), one per line
|
||||
# Each line of this file should specify either one critical process or one
|
||||
# critical process group, (as defined in supervisord.conf file), in the
|
||||
# following format:
|
||||
#
|
||||
# program:<process_name>
|
||||
# group:<group_name>
|
||||
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
|
||||
|
||||
# This table in the database contains the features for container and each
|
||||
# feature for a row will be configured a state or number.
|
||||
CONTAINER_FEATURE_TABLE_NAME = 'CONTAINER_FEATURE'
|
||||
|
||||
# Read the critical processes/group names from CRITICAL_PROCESSES_FILE
|
||||
def get_critical_group_and_process_list(critical_processes_file=None):
    """Parse the critical_processes file into group and process name lists.

    Every non-blank line of the file must have one of the two forms

        program:<process_name>
        group:<group_name>

    Whitespace around the key and around the name is ignored. Blank
    (whitespace-only) lines are skipped rather than treated as errors.

    Args:
        critical_processes_file: Path of the file to parse. When None
            (the default), CRITICAL_PROCESSES_FILE is used.

    Returns:
        A tuple (critical_group_list, critical_process_list) holding the
        group names and the process names in the order they appear.

    Exits:
        Logs an error to syslog and calls sys.exit(5) if a line does not
        contain exactly one ':' separator, or sys.exit(6) if the key is
        neither "group" nor "program" or the name is empty.
    """
    if critical_processes_file is None:
        critical_processes_file = CRITICAL_PROCESSES_FILE

    critical_group_list = []
    critical_process_list = []

    with open(critical_processes_file, 'r') as critical_file:
        for line in critical_file:
            # A blank line (e.g. a trailing empty line) carries no entry;
            # skip it instead of aborting the whole listener.
            if not line.strip():
                continue

            line_info = line.strip(' \n').split(':')
            if len(line_info) != 2:
                syslog.syslog(syslog.LOG_ERR, "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
                sys.exit(5)

            identifier_key = line_info[0].strip()
            identifier_value = line_info[1].strip()
            if identifier_key == "group" and identifier_value:
                critical_group_list.append(identifier_value)
            elif identifier_key == "program" and identifier_value:
                critical_process_list.append(identifier_value)
            else:
                # Unknown key, or a key with an empty name after the colon
                syslog.syslog(syslog.LOG_ERR, "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
                sys.exit(6)

    return critical_group_list, critical_process_list
|
||||
|
||||
def main(argv):
|
||||
container_name = None
|
||||
opts, args = getopt.getopt(argv, "c:", ["container-name="])
|
||||
@ -29,9 +57,7 @@ def main(argv):
|
||||
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
|
||||
sys.exit(1)
|
||||
|
||||
# Read the list of critical processes from a file
|
||||
with open(CRITICAL_PROCESSES_FILE, 'r') as f:
|
||||
critical_processes = [line.rstrip('\n') for line in f]
|
||||
critical_group_list, critical_process_list = get_critical_group_and_process_list()
|
||||
|
||||
while True:
|
||||
# Transition from ACKNOWLEDGED to READY
|
||||
@ -73,7 +99,7 @@ def main(argv):
|
||||
# If container is database or auto-restart feature is enabled and at the same time
|
||||
# a critical process exited unexpectedly, terminate supervisor
|
||||
if ((container_name == 'database' or restart_feature == 'enabled') and expected == 0 and
|
||||
(processname in critical_processes or groupname in critical_processes)):
|
||||
(processname in critical_process_list or groupname in critical_group_list)):
|
||||
MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..."
|
||||
msg = MSG_FORMAT_STR.format(payload_headers['processname'])
|
||||
syslog.syslog(syslog.LOG_INFO, msg)
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1,2 +1,2 @@
|
||||
dsserve
|
||||
syncd
|
||||
program:dsserve
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
@ -1,2 +1,2 @@
|
||||
dsserve
|
||||
syncd
|
||||
program:dsserve
|
||||
program:syncd
|
||||
|
@ -1 +1 @@
|
||||
syncd
|
||||
program:syncd
|
||||
|
Loading…
Reference in New Issue
Block a user