Add a watchdog mechanism to the swss service and generate an alert when swss has an issue. (#15429)
Add a watchdog mechanism to the swss service and generate an alert when swss has an issue. **Work item tracking** Microsoft ADO (number only): 16578912 **What I did** Added an orchagent watchdog to monitor for, and alert on, a stuck orchagent. **Why I did it** Currently the SONiC monit system only checks whether the orchagent process exists. If the orchagent process is stuck and stops processing, the current monit check cannot detect or report it. **How I verified it** All UTs pass. Manually verified that process_monitoring/test_critical_process_monitoring.py passes. Added a new UT, https://github.com/sonic-net/sonic-mgmt/pull/8306, to check that the watchdog works correctly. Manually tested: after pausing orchagent with 'kill -STOP <pid>', checked that the following message appears in the log: Apr 28 23:36:41.504923 vlab-01 ERR swss#supervisor-proc-watchdog-listener: Process 'orchagent' is stuck in namespace 'host' (1.0 minutes). **Details if related** Heartbeat message PR: https://github.com/sonic-net/sonic-swss/pull/2737 UT PR: https://github.com/sonic-net/sonic-mgmt/pull/8306
This commit is contained in:
parent
633fff8c10
commit
05f1a5a31e
@ -18,6 +18,7 @@ CFGGEN_PARAMS=" \
|
||||
-t /usr/share/sonic/templates/vlan_vars.j2 \
|
||||
-t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
|
||||
-t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
|
||||
-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
|
||||
-t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
|
||||
-t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
|
||||
"
|
||||
|
@ -14,7 +14,7 @@ buffer_size=1024
|
||||
|
||||
[eventlistener:supervisor-proc-exit-listener]
|
||||
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
|
||||
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
|
||||
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
|
||||
autostart=true
|
||||
autorestart=unexpected
|
||||
buffer_size=1024
|
||||
@ -75,6 +75,7 @@ command=/usr/bin/orchagent.sh
|
||||
priority=4
|
||||
autostart=false
|
||||
autorestart=false
|
||||
stdout_capture_maxbytes=1MB
|
||||
stdout_logfile=syslog
|
||||
stderr_logfile=syslog
|
||||
dependent_startup=true
|
||||
|
1
dockers/docker-orchagent/watchdog_processes.j2
Normal file
1
dockers/docker-orchagent/watchdog_processes.j2
Normal file
@ -0,0 +1 @@
|
||||
program:orchagent
|
@ -14,6 +14,12 @@ from swsscommon import swsscommon
|
||||
|
||||
from supervisor import childutils
|
||||
|
||||
# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
|
||||
# following format:
|
||||
#
|
||||
# program:<process_name>
|
||||
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
|
||||
|
||||
# Each line of this file should specify either one critical process or one
|
||||
# critical process group, (as defined in supervisord.conf file), in the
|
||||
# following format:
|
||||
@ -34,15 +40,15 @@ ALERTING_INTERVAL_SECS = 60
|
||||
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
|
||||
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
|
||||
|
||||
def get_critical_group_and_process_list():
|
||||
def get_group_and_process_list(process_file):
|
||||
"""
|
||||
@summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
|
||||
@summary: Read the critical processes/group names.
|
||||
@return: Two lists which contain critical processes and group names respectively.
|
||||
"""
|
||||
critical_group_list = []
|
||||
critical_process_list = []
|
||||
group_list = []
|
||||
process_list = []
|
||||
|
||||
with open(CRITICAL_PROCESSES_FILE, 'r') as file:
|
||||
with open(process_file, 'r') as file:
|
||||
for line in file:
|
||||
# ignore blank lines
|
||||
if re.match(r"^\s*$", line):
|
||||
@ -50,24 +56,24 @@ def get_critical_group_and_process_list():
|
||||
line_info = line.strip(' \n').split(':')
|
||||
if len(line_info) != 2:
|
||||
syslog.syslog(syslog.LOG_ERR,
|
||||
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
|
||||
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
|
||||
sys.exit(5)
|
||||
|
||||
identifier_key = line_info[0].strip()
|
||||
identifier_value = line_info[1].strip()
|
||||
if identifier_key == "group" and identifier_value:
|
||||
critical_group_list.append(identifier_value)
|
||||
group_list.append(identifier_value)
|
||||
elif identifier_key == "program" and identifier_value:
|
||||
critical_process_list.append(identifier_value)
|
||||
process_list.append(identifier_value)
|
||||
else:
|
||||
syslog.syslog(syslog.LOG_ERR,
|
||||
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
|
||||
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
|
||||
sys.exit(6)
|
||||
|
||||
return critical_group_list, critical_process_list
|
||||
return group_list, process_list
|
||||
|
||||
|
||||
def generate_alerting_message(process_name, dead_minutes):
|
||||
def generate_alerting_message(process_name, status, dead_minutes):
|
||||
"""
|
||||
@summary: If a critical process was not running, this function will determine it resides in host
|
||||
or in a specific namespace. Then an alerting message will be written into syslog.
|
||||
@ -80,8 +86,8 @@ def generate_alerting_message(process_name, dead_minutes):
|
||||
else:
|
||||
namespace = namespace_prefix + namespace_id
|
||||
|
||||
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
|
||||
.format(process_name, namespace, dead_minutes))
|
||||
syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
|
||||
.format(process_name, status, namespace, dead_minutes))
|
||||
|
||||
|
||||
def get_autorestart_state(container_name):
|
||||
@ -125,9 +131,15 @@ def main(argv):
|
||||
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
|
||||
sys.exit(1)
|
||||
|
||||
critical_group_list, critical_process_list = get_critical_group_and_process_list()
|
||||
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
|
||||
|
||||
# WATCH_PROCESSES_FILE is optional
|
||||
watch_process_list = []
|
||||
if os.path.exists(WATCH_PROCESSES_FILE):
|
||||
_, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
|
||||
|
||||
process_under_alerting = defaultdict(dict)
|
||||
process_heart_beat_info = defaultdict(dict)
|
||||
# Transition from ACKNOWLEDGED to READY
|
||||
childutils.listener.ready()
|
||||
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
|
||||
@ -167,6 +179,15 @@ def main(argv):
|
||||
if process_name in process_under_alerting:
|
||||
process_under_alerting.pop(process_name)
|
||||
|
||||
# Handle the PROCESS_COMMUNICATION_STDOUT event
|
||||
elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
|
||||
payload_headers, payload_data = childutils.eventdata(payload + '\n')
|
||||
process_name = payload_headers['processname']
|
||||
|
||||
# update process heart beat time
|
||||
if (process_name in watch_process_list):
|
||||
process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
|
||||
|
||||
# Transition from BUSY to ACKNOWLEDGED
|
||||
childutils.listener.ok()
|
||||
|
||||
@ -181,7 +202,15 @@ def main(argv):
|
||||
elapsed_mins = elapsed_secs // 60
|
||||
process_under_alerting[process_name]["last_alerted"] = epoch_time
|
||||
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
|
||||
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
|
||||
generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
|
||||
|
||||
# Check whether we need write alerting messages into syslog
|
||||
for process in process_heart_beat_info.keys():
|
||||
epoch_time = time.time()
|
||||
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
|
||||
if elapsed_secs >= ALERTING_INTERVAL_SECS:
|
||||
elapsed_mins = elapsed_secs // 60
|
||||
generate_alerting_message(process, "stuck", elapsed_mins)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv[1:])
|
||||
main(sys.argv[1:])
|
Loading…
Reference in New Issue
Block a user