From cec9d7b83a84f733602f06261cd20c8eb51d9b63 Mon Sep 17 00:00:00 2001 From: Ye Jianquan Date: Fri, 9 Jun 2023 09:10:35 +0800 Subject: [PATCH] Revert "Add watchdog mechanism to swss service and generate alert when swss have issue. (#14686)" (#15390) This reverts commit 44427a2f6b779471a68ffaabcc662778a694b414. Docker image not updated during PR validation and caused PR check failures. Force merge this revert. After cache is updated after this PR is merged, issue should be fixed. --- dockers/docker-orchagent/docker-init.j2 | 1 - dockers/docker-orchagent/supervisord.conf.j2 | 3 +- .../docker-orchagent/watchdog_processes.j2 | 1 - files/scripts/supervisor-proc-exit-listener | 55 +++++-------------- 4 files changed, 16 insertions(+), 44 deletions(-) delete mode 100644 dockers/docker-orchagent/watchdog_processes.j2 diff --git a/dockers/docker-orchagent/docker-init.j2 b/dockers/docker-orchagent/docker-init.j2 index bea9befc0b..46db99a316 100755 --- a/dockers/docker-orchagent/docker-init.j2 +++ b/dockers/docker-orchagent/docker-init.j2 @@ -18,7 +18,6 @@ CFGGEN_PARAMS=" \ -t /usr/share/sonic/templates/vlan_vars.j2 \ -t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \ -t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \ - -t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \ -t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf -t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \ " diff --git a/dockers/docker-orchagent/supervisord.conf.j2 b/dockers/docker-orchagent/supervisord.conf.j2 index 6c822f464a..eef040ee17 100644 --- a/dockers/docker-orchagent/supervisord.conf.j2 +++ b/dockers/docker-orchagent/supervisord.conf.j2 @@ -14,7 +14,7 @@ buffer_size=1024 [eventlistener:supervisor-proc-exit-listener] command=/usr/bin/supervisor-proc-exit-listener --container-name swss -events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT +events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING autostart=true autorestart=unexpected buffer_size=1024 @@ -75,7 +75,6 @@ command=/usr/bin/orchagent.sh priority=4 autostart=false autorestart=false -stdout_capture_maxbytes=1MB stdout_logfile=syslog stderr_logfile=syslog dependent_startup=true diff --git a/dockers/docker-orchagent/watchdog_processes.j2 b/dockers/docker-orchagent/watchdog_processes.j2 deleted file mode 100644 index bbe7c3a734..0000000000 --- a/dockers/docker-orchagent/watchdog_processes.j2 +++ /dev/null @@ -1 +0,0 @@ -program:orchagent \ No newline at end of file diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 9ba29466a9..dbfdaf2c5a 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -14,12 +14,6 @@ from swsscommon import swsscommon from supervisor import childutils -# Each line of this file should specify one process, (as defined in supervisord.conf file), in the -# following format: -# -# program: -WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes' - # Each line of this file should specify either one critical process or one # critical process group, (as defined in supervisord.conf file), in the # following format: @@ -40,15 +34,15 @@ ALERTING_INTERVAL_SECS = 60 EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly" -def get_group_and_process_list(process_file): +def get_critical_group_and_process_list(): """ - @summary: Read the critical processes/group names. + @summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE. @return: Two lists which contain critical processes and group names respectively. """ - group_list = [] - process_list = [] + critical_group_list = [] + critical_process_list = [] - with open(process_file, 'r') as file: + with open(CRITICAL_PROCESSES_FILE, 'r') as file: for line in file: # ignore blank lines if re.match(r"^\s*$", line): @@ -56,24 +50,24 @@ def get_group_and_process_list(process_file): line_info = line.strip(' \n').split(':') if len(line_info) != 2: syslog.syslog(syslog.LOG_ERR, - "Syntax of the line {} in processes file is incorrect. Exiting...".format(line)) + "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line)) sys.exit(5) identifier_key = line_info[0].strip() identifier_value = line_info[1].strip() if identifier_key == "group" and identifier_value: - group_list.append(identifier_value) + critical_group_list.append(identifier_value) elif identifier_key == "program" and identifier_value: - process_list.append(identifier_value) + critical_process_list.append(identifier_value) else: syslog.syslog(syslog.LOG_ERR, - "Syntax of the line {} in processes file is incorrect. Exiting...".format(line)) + "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line)) sys.exit(6) - return group_list, process_list + return critical_group_list, critical_process_list -def generate_alerting_message(process_name, status, dead_minutes): +def generate_alerting_message(process_name, dead_minutes): """ @summary: If a critical process was not running, this function will determine it resides in host or in a specific namespace. Then an alerting message will be written into syslog. @@ -86,8 +80,8 @@ def generate_alerting_message(process_name, status, dead_minutes): else: namespace = namespace_prefix + namespace_id - syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)." - .format(process_name, status, namespace, dead_minutes)) + syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)." + .format(process_name, namespace, dead_minutes)) def get_autorestart_state(container_name): @@ -131,11 +125,9 @@ def main(argv): syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...") sys.exit(1) - critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE) - _, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE) + critical_group_list, critical_process_list = get_critical_group_and_process_list() process_under_alerting = defaultdict(dict) - process_heart_beat_info = defaultdict(dict) # Transition from ACKNOWLEDGED to READY childutils.listener.ready() events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) @@ -175,15 +167,6 @@ def main(argv): if process_name in process_under_alerting: process_under_alerting.pop(process_name) - # Handle the PROCESS_COMMUNICATION_STDOUT event - elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT': - payload_headers, payload_data = childutils.eventdata(payload + '\n') - process_name = payload_headers['processname'] - - # update process heart beat time - if (process_name in watch_process_list): - process_heart_beat_info[process_name]["last_heart_beat"] = time.time() - # Transition from BUSY to ACKNOWLEDGED childutils.listener.ok() @@ -198,15 +181,7 @@ def main(argv): elapsed_mins = elapsed_secs // 60 process_under_alerting[process_name]["last_alerted"] = epoch_time process_under_alerting[process_name]["dead_minutes"] += elapsed_mins - generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"]) - - # Check whether we need write alerting messages into syslog - for process in process_heart_beat_info.keys(): - epoch_time = time.time() - elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"] - if elapsed_secs >= ALERTING_INTERVAL_SECS: - elapsed_mins = elapsed_secs // 60 - generate_alerting_message(process, "stuck", elapsed_mins) + generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"]) if __name__ == "__main__": main(sys.argv[1:])