Add a watchdog mechanism to the swss service and generate an alert when swss has issues. (#15429)


**Work item tracking**
Microsoft ADO (number only): 16578912

**What I did**
Added an orchagent watchdog that monitors orchagent and raises an alert when the process is stuck.

**Why I did it**
Currently, the SONiC monit system only checks whether the orchagent process exists. If orchagent gets stuck and stops processing, monit cannot detect or report the failure.

**How I verified it**
All unit tests pass.

Manually verified that process_monitoring/test_critical_process_monitoring.py passes.

Added a new unit test (https://github.com/sonic-net/sonic-mgmt/pull/8306) to check that the watchdog works correctly.

Manually tested: after pausing orchagent with `kill -STOP <pid>`, verified that a warning message appears in the log:

Apr 28 23:36:41.504923 vlab-01 ERR swss#supervisor-proc-watchdog-listener: Process 'orchagent' is stuck in namespace 'host' (1.0 minutes).

**Details if related**
Heartbeat message PR: https://github.com/sonic-net/sonic-swss/pull/2737
UT PR: https://github.com/sonic-net/sonic-mgmt/pull/8306
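
For background on the heartbeat mechanism: supervisord only turns a process's captured stdout into PROCESS_COMMUNICATION_STDOUT events when the output is wrapped in supervisor's communication tokens. The real heartbeat emitter lives in orchagent (C++, the sonic-swss PR above); the following is only a minimal Python sketch of that protocol, with a made-up message and interval:

```python
# Sketch of a supervisor-compatible heartbeat emitter (illustrative only).
# Assumes the program runs under supervisord with stdout_capture_maxbytes set.
import sys
import time

# supervisor's process-communication tokens; data between them is delivered
# to subscribed event listeners as a PROCESS_COMMUNICATION_STDOUT event.
BEGIN_TOKEN = '<!--XSUPERVISOR:BEGIN-->'
END_TOKEN = '<!--XSUPERVISOR:END-->'

def emit_heartbeat(message='heartbeat'):
    sys.stdout.write(BEGIN_TOKEN + message + END_TOKEN)
    sys.stdout.flush()

if __name__ == '__main__':
    while True:
        emit_heartbeat()
        time.sleep(60)  # made-up interval
```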
Commit 05f1a5a31e by Hua Liu, committed via GitHub on 2023-06-12 17:53:54 -07:00 (parent 633fff8c10).
4 changed files with 49 additions and 17 deletions.


@@ -18,6 +18,7 @@ CFGGEN_PARAMS=" \
 -t /usr/share/sonic/templates/vlan_vars.j2 \
 -t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
 -t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
+-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
 -t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
 -t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
 "


@@ -14,7 +14,7 @@ buffer_size=1024
 [eventlistener:supervisor-proc-exit-listener]
 command=/usr/bin/supervisor-proc-exit-listener --container-name swss
-events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
+events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
 autostart=true
 autorestart=unexpected
 buffer_size=1024
@@ -75,6 +75,7 @@ command=/usr/bin/orchagent.sh
 priority=4
 autostart=false
 autorestart=false
+stdout_capture_maxbytes=1MB
 stdout_logfile=syslog
 stderr_logfile=syslog
 dependent_startup=true
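
Both halves of this config are required: `events=...` subscribes the listener to PROCESS_COMMUNICATION_STDOUT, and `stdout_capture_maxbytes=1MB` enables stdout capture for orchagent, without which supervisord never emits the event. For orientation, a minimal sketch of the event-listener protocol that supervisor-proc-exit-listener follows (heavily simplified; the stderr logging is made up):

```python
# Minimal supervisord event-listener loop (sketch): READY handshake,
# receive one event, acknowledge, repeat.
import sys
from supervisor import childutils

def main():
    while True:
        # Prints the READY token and blocks until supervisord sends an event.
        headers, payload = childutils.listener.wait(sys.stdin, sys.stdout)
        if headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
            payload_headers, payload_data = childutils.eventdata(payload + '\n')
            # stdout is reserved for the listener protocol, so log to stderr.
            sys.stderr.write('heartbeat from %s\n' % payload_headers['processname'])
        # Tell supervisord this event has been processed (ACKNOWLEDGED).
        childutils.listener.ok(sys.stdout)

if __name__ == '__main__':
    main()
```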


@@ -0,0 +1 @@
+program:orchagent
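
So the rendered /etc/supervisor/watchdog_processes contains the single line `program:orchagent`. A quick sketch of how one such line is split, mirroring the parsing helper in the listener change below:

```python
# Parse one 'program:<process_name>' line the way
# get_group_and_process_list does (sketch).
line = 'program:orchagent\n'
identifier_key, identifier_value = [part.strip() for part in line.strip(' \n').split(':')]
assert identifier_key == 'program'
assert identifier_value == 'orchagent'
```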


@@ -14,6 +14,12 @@ from swsscommon import swsscommon
 from supervisor import childutils
 
+# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
+# following format:
+#
+# program:<process_name>
+WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
+
 # Each line of this file should specify either one critical process or one
 # critical process group, (as defined in supervisord.conf file), in the
 # following format:
@@ -34,15 +40,15 @@ ALERTING_INTERVAL_SECS = 60
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
 
-def get_critical_group_and_process_list():
+def get_group_and_process_list(process_file):
     """
-    @summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
+    @summary: Read the critical processes/group names.
     @return: Two lists which contain critical processes and group names respectively.
     """
-    critical_group_list = []
-    critical_process_list = []
+    group_list = []
+    process_list = []
-    with open(CRITICAL_PROCESSES_FILE, 'r') as file:
+    with open(process_file, 'r') as file:
         for line in file:
             # ignore blank lines
             if re.match(r"^\s*$", line):
@@ -50,24 +56,24 @@ def get_critical_group_and_process_list():
             line_info = line.strip(' \n').split(':')
             if len(line_info) != 2:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                 sys.exit(5)
             identifier_key = line_info[0].strip()
             identifier_value = line_info[1].strip()
             if identifier_key == "group" and identifier_value:
-                critical_group_list.append(identifier_value)
+                group_list.append(identifier_value)
             elif identifier_key == "program" and identifier_value:
-                critical_process_list.append(identifier_value)
+                process_list.append(identifier_value)
             else:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                 sys.exit(6)
-    return critical_group_list, critical_process_list
+    return group_list, process_list
 
-def generate_alerting_message(process_name, dead_minutes):
+def generate_alerting_message(process_name, status, dead_minutes):
     """
     @summary: If a critical process was not running, this function will determine it resides in host
               or in a specific namespace. Then an alerting message will be written into syslog.
@@ -80,8 +86,8 @@ def generate_alerting_message(process_name, dead_minutes):
     else:
         namespace = namespace_prefix + namespace_id
 
-    syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
-                  .format(process_name, namespace, dead_minutes))
+    syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
+                  .format(process_name, status, namespace, dead_minutes))
 
 def get_autorestart_state(container_name):
@@ -125,9 +131,15 @@ def main(argv):
         syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
         sys.exit(1)
 
-    critical_group_list, critical_process_list = get_critical_group_and_process_list()
+    critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
+
+    # WATCH_PROCESSES_FILE is optional
+    watch_process_list = []
+    if os.path.exists(WATCH_PROCESSES_FILE):
+        _, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
 
     process_under_alerting = defaultdict(dict)
+    process_heart_beat_info = defaultdict(dict)
 
     # Transition from ACKNOWLEDGED to READY
     childutils.listener.ready()
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -167,6 +179,15 @@ def main(argv):
                 if process_name in process_under_alerting:
                     process_under_alerting.pop(process_name)
 
+            # Handle the PROCESS_COMMUNICATION_STDOUT event
+            elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
+                payload_headers, payload_data = childutils.eventdata(payload + '\n')
+                process_name = payload_headers['processname']
+
+                # update process heart beat time
+                if (process_name in watch_process_list):
+                    process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
+
             # Transition from BUSY to ACKNOWLEDGED
             childutils.listener.ok()
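
For context on the eventdata call: a PROCESS_COMMUNICATION_STDOUT payload is one header line followed by the captured stdout data, so `childutils.eventdata` splits it cleanly. An illustrative payload (process name headers as documented by supervisor; the pid is fabricated):

```python
# Illustrative PROCESS_COMMUNICATION_STDOUT payload (fabricated pid);
# eventdata splits the header line from the captured stdout data.
from supervisor import childutils

payload = 'processname:orchagent groupname:orchagent pid:1234\nheartbeat'
payload_headers, payload_data = childutils.eventdata(payload + '\n')
assert payload_headers['processname'] == 'orchagent'
assert payload_data == 'heartbeat\n'
```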
@@ -181,7 +202,15 @@ def main(argv):
                 elapsed_mins = elapsed_secs // 60
                 process_under_alerting[process_name]["last_alerted"] = epoch_time
                 process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
-                generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
+                generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
+
+        # Check whether we need write alerting messages into syslog
+        for process in process_heart_beat_info.keys():
+            epoch_time = time.time()
+            elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
+            if elapsed_secs >= ALERTING_INTERVAL_SECS:
+                elapsed_mins = elapsed_secs // 60
+                generate_alerting_message(process, "stuck", elapsed_mins)
 
 if __name__ == "__main__":
-    main(sys.argv[1:])
+    main(sys.argv[1:])
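
The stuck check mirrors the existing not-running alert: once no heartbeat has arrived for ALERTING_INTERVAL_SECS (60 seconds), the elapsed time is floored to whole minutes, which is why the syslog line quoted above reports '(1.0 minutes)'. A toy run of the same arithmetic with a fabricated timestamp:

```python
# Toy run of the stuck-detection arithmetic, with a fabricated
# last-heartbeat timestamp 90 seconds in the past.
import time

ALERTING_INTERVAL_SECS = 60
process_heart_beat_info = {'orchagent': {'last_heart_beat': time.time() - 90}}

for process, info in process_heart_beat_info.items():
    elapsed_secs = time.time() - info['last_heart_beat']
    if elapsed_secs >= ALERTING_INTERVAL_SECS:
        elapsed_mins = elapsed_secs // 60
        # Prints: Process 'orchagent' is stuck (1.0 minutes).
        print("Process '{}' is stuck ({} minutes).".format(process, elapsed_mins))
```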