[Supervisord] Deduplicate the alerting messages of critical processes from Supervisord. (#6849)
Signed-off-by: Yong Zhao <yozhao@microsoft.com>

Why I did it:
In the rsyslog configuration, duplicate messages are suppressed and reported in the form "message repeated n times". Because of this behavior, if a critical process in a container exited unexpectedly, the alerting message was written into syslog once and was not written again until a second critical process exited. This PR differentiates these alerting messages so that they are not suppressed by rsyslogd and appear in syslog periodically.

How I did it:
This PR adds a counter to the alerting message that shows how many minutes a critical process has not been running.

How to verify it:
I verified and tested this implementation on a physical DUT.
This commit is contained in:
parent
c6178259fc
commit
21f5e1280d
@ -8,6 +8,7 @@ import signal
|
||||
import sys
|
||||
import syslog
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
import swsssdk
|
||||
|
||||
@ -64,7 +65,7 @@ def get_critical_group_and_process_list():
|
||||
return critical_group_list, critical_process_list
|
||||
|
||||
|
||||
def generate_alerting_message(process_name):
|
||||
def generate_alerting_message(process_name, dead_minutes):
|
||||
"""
|
||||
@summary: If a critical process was not running, this function will determine it resides in host
|
||||
or in a specific namespace. Then an alerting message will be written into syslog.
|
||||
@ -77,7 +78,8 @@ def generate_alerting_message(process_name):
|
||||
else:
|
||||
namespace = namespace_prefix + namespace_id
|
||||
|
||||
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}'.".format(process_name, namespace))
|
||||
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
|
||||
.format(process_name, namespace, dead_minutes))
|
||||
|
||||
|
||||
def get_autorestart_state(container_name):
|
||||
@ -118,7 +120,7 @@ def main(argv):
|
||||
|
||||
critical_group_list, critical_process_list = get_critical_group_and_process_list()
|
||||
|
||||
process_under_alerting = {}
|
||||
process_under_alerting = defaultdict(dict)
|
||||
# Transition from ACKNOWLEDGED to READY
|
||||
childutils.listener.ready()
|
||||
|
||||
@ -145,7 +147,8 @@ def main(argv):
|
||||
syslog.syslog(syslog.LOG_INFO, msg)
|
||||
os.kill(os.getppid(), signal.SIGTERM)
|
||||
else:
|
||||
process_under_alerting[process_name] = time.time()
|
||||
process_under_alerting[process_name]["last_alerted"] = time.time()
|
||||
process_under_alerting[process_name]["dead_minutes"] = 0
|
||||
|
||||
# Handle the PROCESS_STATE_RUNNING event
|
||||
elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
|
||||
@ -162,11 +165,14 @@ def main(argv):
|
||||
childutils.listener.ready()
|
||||
|
||||
# Check whether we need write alerting messages into syslog
|
||||
for process in process_under_alerting.keys():
|
||||
for process_name in process_under_alerting.keys():
|
||||
epoch_time = time.time()
|
||||
if epoch_time - process_under_alerting[process] >= ALERTING_INTERVAL_SECS:
|
||||
process_under_alerting[process] = epoch_time
|
||||
generate_alerting_message(process)
|
||||
elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
|
||||
if elapsed_secs >= ALERTING_INTERVAL_SECS:
|
||||
elapsed_mins = elapsed_secs // 60
|
||||
process_under_alerting[process_name]["last_alerted"] = epoch_time
|
||||
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
|
||||
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Reference in New Issue
Block a user