Publish additional events (#12563)
Add event_publish code or regex for rsyslog plugin for additional events
This commit is contained in:
parent
e2b3bdf72a
commit
8f48773fd1
@ -3,5 +3,10 @@
|
||||
"tag": "dhcp-relay-discard",
|
||||
"regex": "Discarding packet received on ([a-zA-Z0-9-_]*) interface that has no IPv4 address assigned.",
|
||||
"params": [ "ifname" ]
|
||||
},
|
||||
{
|
||||
"tag": "dhcp-relay-bind-failure",
|
||||
"regex": "Failed to bind socket to (link local|global) ipv6 address on interface ([a-zA-Z0-9]*)",
|
||||
"params": [ "type:ret=(arg==\"link local\")and\"local\"or\"global\")", "vlan" ]
|
||||
}
|
||||
]
|
||||
|
@ -27,6 +27,10 @@ import re
|
||||
|
||||
import docker
|
||||
|
||||
from swsscommon import swsscommon
|
||||
|
||||
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
|
||||
EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"
|
||||
|
||||
def get_command_result(command):
|
||||
"""Executes the command and return the resulting output.
|
||||
@ -54,8 +58,14 @@ def get_command_result(command):
|
||||
|
||||
return command_stdout.strip()
|
||||
|
||||
def publish_events(events_handle, container_name, mem_usage_bytes, threshold_value):
    """Publish a memory-threshold event for a container.

    Args:
        events_handle: Publisher handle forwarded to ``swsscommon.event_publish``.
        container_name: Container name, published under the "ctr_name" key.
        mem_usage_bytes: Measured memory usage, published under "mem_usage".
        threshold_value: Threshold that was exceeded, published under "threshold".
    """
    params = swsscommon.FieldValueMap()
    params["ctr_name"] = container_name
    # NOTE(review): FieldValueMap is understood to be a string-to-string map,
    # and the usage/threshold arguments may arrive as numbers; str() is a
    # no-op for values that are already strings.
    params["mem_usage"] = str(mem_usage_bytes)
    params["threshold"] = str(threshold_value)
    swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
|
||||
|
||||
def check_memory_usage(container_name, threshold_value):
|
||||
def check_memory_usage(events_handle, container_name, threshold_value):
|
||||
"""Checks the memory usage of a container and writes an alerting messages into
|
||||
the syslog if the memory usage is larger than the threshold value.
|
||||
|
||||
@ -89,6 +99,8 @@ def check_memory_usage(container_name, threshold_value):
|
||||
.format(container_name, mem_usage_bytes, threshold_value))
|
||||
syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
|
||||
.format(container_name, mem_usage_bytes, threshold_value))
|
||||
# publish event
|
||||
publish_events(events_handle, container_name, mem_usage_bytes, threshold_value)
|
||||
sys.exit(3)
|
||||
else:
|
||||
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
|
||||
@ -148,13 +160,14 @@ def main():
|
||||
sys.exit(0)
|
||||
|
||||
running_container_names = get_running_container_names()
|
||||
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
|
||||
if args.container_name in running_container_names:
|
||||
check_memory_usage(args.container_name, args.threshold_value)
|
||||
check_memory_usage(events_handle, args.container_name, args.threshold_value)
|
||||
else:
|
||||
syslog.syslog(syslog.LOG_INFO,
|
||||
"[memory_checker] Exits without checking memory usage since container '{}' is not running!"
|
||||
.format(args.container_name))
|
||||
|
||||
swsscommon.events_deinit_publisher(events_handle)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -31,6 +31,8 @@ SELECT_TIMEOUT_SECS = 1.0
|
||||
# Alerting message will be written into syslog in the following interval
|
||||
ALERTING_INTERVAL_SECS = 60
|
||||
|
||||
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
|
||||
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
|
||||
|
||||
def get_critical_group_and_process_list():
|
||||
"""
|
||||
@ -106,6 +108,11 @@ def get_autorestart_state(container_name):
|
||||
|
||||
return is_auto_restart
|
||||
|
||||
def publish_events(events_handle, process_name, container_name):
    """Publish an event identifying a process and its container.

    Args:
        events_handle: Publisher handle forwarded to ``swsscommon.event_publish``.
        process_name: Process name, published under the "process_name" key.
        container_name: Container name, published under the "ctr_name" key.
    """
    event_fields = swsscommon.FieldValueMap()
    # Populate the fields in the same order as before: process first, then container.
    for field, value in (("process_name", process_name), ("ctr_name", container_name)):
        event_fields[field] = value
    swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, event_fields)
|
||||
|
||||
def main(argv):
|
||||
container_name = None
|
||||
@ -123,7 +130,7 @@ def main(argv):
|
||||
process_under_alerting = defaultdict(dict)
|
||||
# Transition from ACKNOWLEDGED to READY
|
||||
childutils.listener.ready()
|
||||
|
||||
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
|
||||
while True:
|
||||
file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
|
||||
if len(file_descriptor_list) > 0:
|
||||
@ -145,6 +152,8 @@ def main(argv):
|
||||
MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
|
||||
msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
|
||||
syslog.syslog(syslog.LOG_INFO, msg)
|
||||
publish_events(events_handle, payload_headers['processname'], container_name)
|
||||
swsscommon.events_deinit_publisher(events_handle)
|
||||
os.kill(os.getppid(), signal.SIGTERM)
|
||||
else:
|
||||
process_under_alerting[process_name]["last_alerted"] = time.time()
|
||||
@ -174,6 +183,5 @@ def main(argv):
|
||||
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
|
||||
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv[1:])
|
||||
|
@ -12,6 +12,8 @@ from . import utils
|
||||
SYSLOG_IDENTIFIER = 'service_checker'
|
||||
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)
|
||||
|
||||
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
|
||||
EVENTS_PUBLISHER_TAG = "process-not-running"
|
||||
|
||||
class ServiceChecker(HealthChecker):
|
||||
"""
|
||||
@ -55,6 +57,8 @@ class ServiceChecker(HealthChecker):
|
||||
|
||||
self.load_critical_process_cache()
|
||||
|
||||
self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
|
||||
|
||||
def get_expected_running_containers(self, feature_table):
|
||||
"""Get a set of containers that are expected to running on SONiC
|
||||
|
||||
@ -288,7 +292,7 @@ class ServiceChecker(HealthChecker):
|
||||
self.reset()
|
||||
self.check_by_monit(config)
|
||||
self.check_services(config)
|
||||
|
||||
swsscommon.events_deinit_publisher(self.events_handle)
|
||||
|
||||
def _parse_supervisorctl_status(self, process_status):
|
||||
"""Expected input:
|
||||
@ -309,6 +313,13 @@ class ServiceChecker(HealthChecker):
|
||||
data[items[0].strip()] = items[1].strip()
|
||||
return data
|
||||
|
||||
def publish_events(self, container_name, critical_process_list):
    """Publish one event per process in ``critical_process_list``.

    Args:
        container_name: Container name, published under the "ctr_name" key.
        critical_process_list: Iterable of process names; each one is
            published under the "process_name" key in its own event.
    """
    event_fields = swsscommon.FieldValueMap()
    event_fields["ctr_name"] = container_name
    for critical_process in critical_process_list:
        # The same map is reused; only "process_name" changes between events.
        event_fields["process_name"] = critical_process
        swsscommon.event_publish(self.events_handle, EVENTS_PUBLISHER_TAG, event_fields)
|
||||
|
||||
def check_process_existence(self, container_name, critical_process_list, config, feature_table):
|
||||
"""Check whether the process in the specified container is running or not.
|
||||
|
||||
@ -333,6 +344,7 @@ class ServiceChecker(HealthChecker):
|
||||
if process_status is None:
|
||||
for process_name in critical_process_list:
|
||||
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
|
||||
self.publish_events(container_name, critical_process_list)
|
||||
return
|
||||
|
||||
process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())
|
||||
|
Loading…
Reference in New Issue
Block a user