Revert "Add watchdog mechanism to swss service and generate alert when swss have issue. (#14686)" (#15390)
This reverts commit 44427a2f6b.
The Docker image was not updated during PR validation, which caused PR check failures.
This revert is being force-merged; once the cache is refreshed after this PR is merged, the issue should be fixed.
This commit is contained in:
parent
0f194c5a03
commit
cec9d7b83a
@ -18,7 +18,6 @@ CFGGEN_PARAMS=" \
|
||||
-t /usr/share/sonic/templates/vlan_vars.j2 \
|
||||
-t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
|
||||
-t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
|
||||
-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
|
||||
-t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
|
||||
-t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
|
||||
"
|
||||
|
@ -14,7 +14,7 @@ buffer_size=1024
|
||||
|
||||
[eventlistener:supervisor-proc-exit-listener]
|
||||
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
|
||||
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
|
||||
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
|
||||
autostart=true
|
||||
autorestart=unexpected
|
||||
buffer_size=1024
|
||||
@ -75,7 +75,6 @@ command=/usr/bin/orchagent.sh
|
||||
priority=4
|
||||
autostart=false
|
||||
autorestart=false
|
||||
stdout_capture_maxbytes=1MB
|
||||
stdout_logfile=syslog
|
||||
stderr_logfile=syslog
|
||||
dependent_startup=true
|
||||
|
@ -1 +0,0 @@
|
||||
program:orchagent
|
@ -14,12 +14,6 @@ from swsscommon import swsscommon
|
||||
|
||||
from supervisor import childutils
|
||||
|
||||
# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
|
||||
# following format:
|
||||
#
|
||||
# program:<process_name>
|
||||
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
|
||||
|
||||
# Each line of this file should specify either one critical process or one
|
||||
# critical process group, (as defined in supervisord.conf file), in the
|
||||
# following format:
|
||||
@ -40,15 +34,15 @@ ALERTING_INTERVAL_SECS = 60
|
||||
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
|
||||
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
|
||||
|
||||
def get_group_and_process_list(process_file):
|
||||
def get_critical_group_and_process_list():
|
||||
"""
|
||||
@summary: Read the critical processes/group names.
|
||||
@summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
|
||||
@return: Two lists which contain critical processes and group names respectively.
|
||||
"""
|
||||
group_list = []
|
||||
process_list = []
|
||||
critical_group_list = []
|
||||
critical_process_list = []
|
||||
|
||||
with open(process_file, 'r') as file:
|
||||
with open(CRITICAL_PROCESSES_FILE, 'r') as file:
|
||||
for line in file:
|
||||
# ignore blank lines
|
||||
if re.match(r"^\s*$", line):
|
||||
@ -56,24 +50,24 @@ def get_group_and_process_list(process_file):
|
||||
line_info = line.strip(' \n').split(':')
|
||||
if len(line_info) != 2:
|
||||
syslog.syslog(syslog.LOG_ERR,
|
||||
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
|
||||
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
|
||||
sys.exit(5)
|
||||
|
||||
identifier_key = line_info[0].strip()
|
||||
identifier_value = line_info[1].strip()
|
||||
if identifier_key == "group" and identifier_value:
|
||||
group_list.append(identifier_value)
|
||||
critical_group_list.append(identifier_value)
|
||||
elif identifier_key == "program" and identifier_value:
|
||||
process_list.append(identifier_value)
|
||||
critical_process_list.append(identifier_value)
|
||||
else:
|
||||
syslog.syslog(syslog.LOG_ERR,
|
||||
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
|
||||
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
|
||||
sys.exit(6)
|
||||
|
||||
return group_list, process_list
|
||||
return critical_group_list, critical_process_list
|
||||
|
||||
|
||||
def generate_alerting_message(process_name, status, dead_minutes):
|
||||
def generate_alerting_message(process_name, dead_minutes):
|
||||
"""
|
||||
@summary: If a critical process was not running, this function will determine it resides in host
|
||||
or in a specific namespace. Then an alerting message will be written into syslog.
|
||||
@ -86,8 +80,8 @@ def generate_alerting_message(process_name, status, dead_minutes):
|
||||
else:
|
||||
namespace = namespace_prefix + namespace_id
|
||||
|
||||
syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
|
||||
.format(process_name, status, namespace, dead_minutes))
|
||||
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
|
||||
.format(process_name, namespace, dead_minutes))
|
||||
|
||||
|
||||
def get_autorestart_state(container_name):
|
||||
@ -131,11 +125,9 @@ def main(argv):
|
||||
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
|
||||
sys.exit(1)
|
||||
|
||||
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
|
||||
_, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
|
||||
critical_group_list, critical_process_list = get_critical_group_and_process_list()
|
||||
|
||||
process_under_alerting = defaultdict(dict)
|
||||
process_heart_beat_info = defaultdict(dict)
|
||||
# Transition from ACKNOWLEDGED to READY
|
||||
childutils.listener.ready()
|
||||
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
|
||||
@ -175,15 +167,6 @@ def main(argv):
|
||||
if process_name in process_under_alerting:
|
||||
process_under_alerting.pop(process_name)
|
||||
|
||||
# Handle the PROCESS_COMMUNICATION_STDOUT event
|
||||
elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
|
||||
payload_headers, payload_data = childutils.eventdata(payload + '\n')
|
||||
process_name = payload_headers['processname']
|
||||
|
||||
# update process heart beat time
|
||||
if (process_name in watch_process_list):
|
||||
process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
|
||||
|
||||
# Transition from BUSY to ACKNOWLEDGED
|
||||
childutils.listener.ok()
|
||||
|
||||
@ -198,15 +181,7 @@ def main(argv):
|
||||
elapsed_mins = elapsed_secs // 60
|
||||
process_under_alerting[process_name]["last_alerted"] = epoch_time
|
||||
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
|
||||
generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
|
||||
|
||||
# Check whether we need write alerting messages into syslog
|
||||
for process in process_heart_beat_info.keys():
|
||||
epoch_time = time.time()
|
||||
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
|
||||
if elapsed_secs >= ALERTING_INTERVAL_SECS:
|
||||
elapsed_mins = elapsed_secs // 60
|
||||
generate_alerting_message(process, "stuck", elapsed_mins)
|
||||
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv[1:])
|
||||
|
Reference in New Issue
Block a user