Revert "Add watchdog mechanism to swss service and generate alert when swss have issue. (#14686)" (#15390)

This reverts commit 44427a2f6b.
The Docker image was not updated during PR validation, which caused PR check failures.
This revert is being force-merged. Once this PR is merged and the cache has been updated, the issue should be fixed.
This commit is contained in:
Ye Jianquan 2023-06-09 09:10:35 +08:00 committed by GitHub
parent 0f194c5a03
commit cec9d7b83a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 16 additions and 44 deletions

View File

@ -18,7 +18,6 @@ CFGGEN_PARAMS=" \
-t /usr/share/sonic/templates/vlan_vars.j2 \ -t /usr/share/sonic/templates/vlan_vars.j2 \
-t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \ -t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
-t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \ -t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
-t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf -t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
-t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \ -t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
" "

View File

@ -14,7 +14,7 @@ buffer_size=1024
[eventlistener:supervisor-proc-exit-listener] [eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name swss command=/usr/bin/supervisor-proc-exit-listener --container-name swss
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true autostart=true
autorestart=unexpected autorestart=unexpected
buffer_size=1024 buffer_size=1024
@ -75,7 +75,6 @@ command=/usr/bin/orchagent.sh
priority=4 priority=4
autostart=false autostart=false
autorestart=false autorestart=false
stdout_capture_maxbytes=1MB
stdout_logfile=syslog stdout_logfile=syslog
stderr_logfile=syslog stderr_logfile=syslog
dependent_startup=true dependent_startup=true

View File

@ -1 +0,0 @@
program:orchagent

View File

@ -14,12 +14,6 @@ from swsscommon import swsscommon
from supervisor import childutils from supervisor import childutils
# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
# following format:
#
# program:<process_name>
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
# Each line of this file should specify either one critical process or one # Each line of this file should specify either one critical process or one
# critical process group, (as defined in supervisord.conf file), in the # critical process group, (as defined in supervisord.conf file), in the
# following format: # following format:
@ -40,15 +34,15 @@ ALERTING_INTERVAL_SECS = 60
EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly" EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
def get_group_and_process_list(process_file): def get_critical_group_and_process_list():
""" """
@summary: Read the critical processes/group names. @summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
@return: Two lists which contain critical processes and group names respectively. @return: Two lists which contain critical processes and group names respectively.
""" """
group_list = [] critical_group_list = []
process_list = [] critical_process_list = []
with open(process_file, 'r') as file: with open(CRITICAL_PROCESSES_FILE, 'r') as file:
for line in file: for line in file:
# ignore blank lines # ignore blank lines
if re.match(r"^\s*$", line): if re.match(r"^\s*$", line):
@ -56,24 +50,24 @@ def get_group_and_process_list(process_file):
line_info = line.strip(' \n').split(':') line_info = line.strip(' \n').split(':')
if len(line_info) != 2: if len(line_info) != 2:
syslog.syslog(syslog.LOG_ERR, syslog.syslog(syslog.LOG_ERR,
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line)) "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
sys.exit(5) sys.exit(5)
identifier_key = line_info[0].strip() identifier_key = line_info[0].strip()
identifier_value = line_info[1].strip() identifier_value = line_info[1].strip()
if identifier_key == "group" and identifier_value: if identifier_key == "group" and identifier_value:
group_list.append(identifier_value) critical_group_list.append(identifier_value)
elif identifier_key == "program" and identifier_value: elif identifier_key == "program" and identifier_value:
process_list.append(identifier_value) critical_process_list.append(identifier_value)
else: else:
syslog.syslog(syslog.LOG_ERR, syslog.syslog(syslog.LOG_ERR,
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line)) "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
sys.exit(6) sys.exit(6)
return group_list, process_list return critical_group_list, critical_process_list
def generate_alerting_message(process_name, status, dead_minutes): def generate_alerting_message(process_name, dead_minutes):
""" """
@summary: If a critical process was not running, this function will determine it resides in host @summary: If a critical process was not running, this function will determine it resides in host
or in a specific namespace. Then an alerting message will be written into syslog. or in a specific namespace. Then an alerting message will be written into syslog.
@ -86,8 +80,8 @@ def generate_alerting_message(process_name, status, dead_minutes):
else: else:
namespace = namespace_prefix + namespace_id namespace = namespace_prefix + namespace_id
syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)." syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
.format(process_name, status, namespace, dead_minutes)) .format(process_name, namespace, dead_minutes))
def get_autorestart_state(container_name): def get_autorestart_state(container_name):
@ -131,11 +125,9 @@ def main(argv):
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...") syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
sys.exit(1) sys.exit(1)
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE) critical_group_list, critical_process_list = get_critical_group_and_process_list()
_, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
process_under_alerting = defaultdict(dict) process_under_alerting = defaultdict(dict)
process_heart_beat_info = defaultdict(dict)
# Transition from ACKNOWLEDGED to READY # Transition from ACKNOWLEDGED to READY
childutils.listener.ready() childutils.listener.ready()
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@ -175,15 +167,6 @@ def main(argv):
if process_name in process_under_alerting: if process_name in process_under_alerting:
process_under_alerting.pop(process_name) process_under_alerting.pop(process_name)
# Handle the PROCESS_COMMUNICATION_STDOUT event
elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
payload_headers, payload_data = childutils.eventdata(payload + '\n')
process_name = payload_headers['processname']
# update process heart beat time
if (process_name in watch_process_list):
process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
# Transition from BUSY to ACKNOWLEDGED # Transition from BUSY to ACKNOWLEDGED
childutils.listener.ok() childutils.listener.ok()
@ -198,15 +181,7 @@ def main(argv):
elapsed_mins = elapsed_secs // 60 elapsed_mins = elapsed_secs // 60
process_under_alerting[process_name]["last_alerted"] = epoch_time process_under_alerting[process_name]["last_alerted"] = epoch_time
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"]) generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
# Check whether we need write alerting messages into syslog
for process in process_heart_beat_info.keys():
epoch_time = time.time()
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
if elapsed_secs >= ALERTING_INTERVAL_SECS:
elapsed_mins = elapsed_secs // 60
generate_alerting_message(process, "stuck", elapsed_mins)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) main(sys.argv[1:])