diff --git a/dockers/docker-orchagent/docker-init.j2 b/dockers/docker-orchagent/docker-init.j2
index bea9befc0b..46db99a316 100755
--- a/dockers/docker-orchagent/docker-init.j2
+++ b/dockers/docker-orchagent/docker-init.j2
@@ -18,7 +18,6 @@ CFGGEN_PARAMS=" \
     -t /usr/share/sonic/templates/vlan_vars.j2 \
     -t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
     -t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
-    -t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
     -t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
     -t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
 "
diff --git a/dockers/docker-orchagent/supervisord.conf.j2 b/dockers/docker-orchagent/supervisord.conf.j2
index 6c822f464a..eef040ee17 100644
--- a/dockers/docker-orchagent/supervisord.conf.j2
+++ b/dockers/docker-orchagent/supervisord.conf.j2
@@ -14,7 +14,7 @@ buffer_size=1024
 
 [eventlistener:supervisor-proc-exit-listener]
 command=/usr/bin/supervisor-proc-exit-listener --container-name swss
-events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
+events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
 autostart=true
 autorestart=unexpected
 buffer_size=1024
@@ -75,7 +75,6 @@ command=/usr/bin/orchagent.sh
 priority=4
 autostart=false
 autorestart=false
-stdout_capture_maxbytes=1MB
 stdout_logfile=syslog
 stderr_logfile=syslog
 dependent_startup=true
diff --git a/dockers/docker-orchagent/watchdog_processes.j2 b/dockers/docker-orchagent/watchdog_processes.j2
deleted file mode 100644
index bbe7c3a734..0000000000
--- a/dockers/docker-orchagent/watchdog_processes.j2
+++ /dev/null
@@ -1 +0,0 @@
-program:orchagent
\ No newline at end of file
diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener
index 9ba29466a9..dbfdaf2c5a 100755
--- a/files/scripts/supervisor-proc-exit-listener
+++ b/files/scripts/supervisor-proc-exit-listener
@@ -14,12 +14,6 @@ from swsscommon import swsscommon
 
 from supervisor import childutils
 
-# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
-# following format:
-#
-# program:
-WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
-
 # Each line of this file should specify either one critical process or one
 # critical process group, (as defined in supervisord.conf file), in the
 # following format:
@@ -40,15 +34,15 @@ ALERTING_INTERVAL_SECS = 60
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
 
-def get_group_and_process_list(process_file):
+def get_critical_group_and_process_list():
     """
-    @summary: Read the critical processes/group names.
+    @summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
     @return: Two lists which contain critical processes and group names respectively.
     """
-    group_list = []
-    process_list = []
+    critical_group_list = []
+    critical_process_list = []
 
-    with open(process_file, 'r') as file:
+    with open(CRITICAL_PROCESSES_FILE, 'r') as file:
         for line in file:
             # ignore blank lines
             if re.match(r"^\s*$", line):
@@ -56,24 +50,24 @@ def get_group_and_process_list(process_file):
             line_info = line.strip(' \n').split(':')
             if len(line_info) != 2:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
                 sys.exit(5)
 
             identifier_key = line_info[0].strip()
             identifier_value = line_info[1].strip()
             if identifier_key == "group" and identifier_value:
-                group_list.append(identifier_value)
+                critical_group_list.append(identifier_value)
             elif identifier_key == "program" and identifier_value:
-                process_list.append(identifier_value)
+                critical_process_list.append(identifier_value)
             else:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
                 sys.exit(6)
 
-    return group_list, process_list
+    return critical_group_list, critical_process_list
 
 
-def generate_alerting_message(process_name, status, dead_minutes):
+def generate_alerting_message(process_name, dead_minutes):
     """
     @summary: If a critical process was not running, this function will determine it resides in host
               or in a specific namespace. Then an alerting message will be written into syslog.
@@ -86,8 +80,8 @@ def generate_alerting_message(process_name, status, dead_minutes):
     else:
         namespace = namespace_prefix + namespace_id
 
-    syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
-                  .format(process_name, status, namespace, dead_minutes))
+    syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
+                  .format(process_name, namespace, dead_minutes))
 
 
 def get_autorestart_state(container_name):
@@ -131,11 +125,9 @@ def main(argv):
         syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
         sys.exit(1)
 
-    critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
-    _, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
+    critical_group_list, critical_process_list = get_critical_group_and_process_list()
 
     process_under_alerting = defaultdict(dict)
-    process_heart_beat_info = defaultdict(dict)
     # Transition from ACKNOWLEDGED to READY
     childutils.listener.ready()
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -175,15 +167,6 @@ def main(argv):
                 if process_name in process_under_alerting:
                     process_under_alerting.pop(process_name)
 
-            # Handle the PROCESS_COMMUNICATION_STDOUT event
-            elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
-                payload_headers, payload_data = childutils.eventdata(payload + '\n')
-                process_name = payload_headers['processname']
-
-                # update process heart beat time
-                if (process_name in watch_process_list):
-                    process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
-
             # Transition from BUSY to ACKNOWLEDGED
             childutils.listener.ok()
 
@@ -198,15 +181,7 @@ def main(argv):
                 elapsed_mins = elapsed_secs // 60
                 process_under_alerting[process_name]["last_alerted"] = epoch_time
                 process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
-                generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
-
-        # Check whether we need write alerting messages into syslog
-        for process in process_heart_beat_info.keys():
-            epoch_time = time.time()
-            elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
-            if elapsed_secs >= ALERTING_INTERVAL_SECS:
-                elapsed_mins = elapsed_secs // 60
-                generate_alerting_message(process, "stuck", elapsed_mins)
+                generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
 
 if __name__ == "__main__":
     main(sys.argv[1:])