Publish additional events (#12563)

Add event_publish calls, or rsyslog plugin regexes, to publish additional events
This commit is contained in:
Zain Budhwani 2022-11-07 09:57:57 -08:00 committed by GitHub
parent e2b3bdf72a
commit 8f48773fd1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 6 deletions

View File

@ -3,5 +3,10 @@
"tag": "dhcp-relay-discard",
"regex": "Discarding packet received on ([a-zA-Z0-9-_]*) interface that has no IPv4 address assigned.",
"params": [ "ifname" ]
},
{
"tag": "dhcp-relay-bind-failure",
"regex": "Failed to bind socket to (link local|global) ipv6 address on interface ([a-zA-Z0-9]*)",
"params": [ "type:ret=(arg==\"link local\")and\"local\"or\"global\"", "vlan" ]
}
]

View File

@ -27,6 +27,10 @@ import re
import docker
from swsscommon import swsscommon
# Identifier handed to swsscommon.events_init_publisher() when the publisher handle is created
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag handed to swsscommon.event_publish() for the events raised by this script
EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"
def get_command_result(command):
"""Executes the command and return the resulting output.
@ -54,8 +58,14 @@ def get_command_result(command):
return command_stdout.strip()
def publish_events(events_handle, container_name, mem_usage_bytes, threshold_value):
    """Publishes an event reporting that a container exceeded its memory threshold.

    Args:
        events_handle: Publisher handle returned by swsscommon.events_init_publisher().
        container_name: Name of the container that was checked.
        mem_usage_bytes: Memory usage of the container; converted to a string
          before being stored in the event parameters.
        threshold_value: Threshold the memory usage was compared against;
          converted to a string before being stored in the event parameters.

    Returns:
        None.
    """
    params = swsscommon.FieldValueMap()
    params["ctr_name"] = container_name
    # FieldValueMap is a string-to-string map, so numeric values have to be
    # converted explicitly. Without the conversion the assignment raises and
    # the caller never reaches its own exit path after publishing.
    params["mem_usage"] = str(mem_usage_bytes)
    params["threshold"] = str(threshold_value)
    swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
def check_memory_usage(container_name, threshold_value):
def check_memory_usage(events_handle, container_name, threshold_value):
"""Checks the memory usage of a container and writes an alerting messages into
the syslog if the memory usage is larger than the threshold value.
@ -89,6 +99,8 @@ def check_memory_usage(container_name, threshold_value):
.format(container_name, mem_usage_bytes, threshold_value))
syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
.format(container_name, mem_usage_bytes, threshold_value))
# publish event
publish_events(events_handle, container_name, mem_usage_bytes, threshold_value)
sys.exit(3)
else:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
@ -148,13 +160,14 @@ def main():
sys.exit(0)
running_container_names = get_running_container_names()
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
if args.container_name in running_container_names:
check_memory_usage(args.container_name, args.threshold_value)
check_memory_usage(events_handle, args.container_name, args.threshold_value)
else:
syslog.syslog(syslog.LOG_INFO,
"[memory_checker] Exits without checking memory usage since container '{}' is not running!"
.format(args.container_name))
swsscommon.events_deinit_publisher(events_handle)
if __name__ == "__main__":
main()

View File

@ -31,6 +31,8 @@ SELECT_TIMEOUT_SECS = 1.0
# Alerting message will be written into syslog in the following interval
ALERTING_INTERVAL_SECS = 60
# Identifier handed to swsscommon.events_init_publisher() when the publisher handle is created
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag handed to swsscommon.event_publish() when a process exits unexpectedly
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
def get_critical_group_and_process_list():
"""
@ -106,6 +108,11 @@ def get_autorestart_state(container_name):
return is_auto_restart
def publish_events(events_handle, process_name, container_name):
    """Publishes an event naming a process and the container it belongs to.

    Args:
        events_handle: Publisher handle returned by swsscommon.events_init_publisher().
        process_name: Value stored under the "process_name" key of the event.
        container_name: Value stored under the "ctr_name" key of the event.

    Returns:
        None.
    """
    event_params = swsscommon.FieldValueMap()
    # Fill the map in the same order as the fields are listed here.
    fields = (
        ("process_name", process_name),
        ("ctr_name", container_name),
    )
    for key, value in fields:
        event_params[key] = value
    swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, event_params)
def main(argv):
container_name = None
@ -123,7 +130,7 @@ def main(argv):
process_under_alerting = defaultdict(dict)
# Transition from ACKNOWLEDGED to READY
childutils.listener.ready()
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
while True:
file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
if len(file_descriptor_list) > 0:
@ -145,6 +152,8 @@ def main(argv):
MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
syslog.syslog(syslog.LOG_INFO, msg)
publish_events(events_handle, payload_headers['processname'], container_name)
swsscommon.events_deinit_publisher(events_handle)
os.kill(os.getppid(), signal.SIGTERM)
else:
process_under_alerting[process_name]["last_alerted"] = time.time()
@ -174,6 +183,5 @@ def main(argv):
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
if __name__ == "__main__":
main(sys.argv[1:])

View File

@ -12,6 +12,8 @@ from . import utils
SYSLOG_IDENTIFIER = 'service_checker'
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)
# Identifier handed to swsscommon.events_init_publisher() when the publisher handle is created
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag handed to swsscommon.event_publish() for each process reported as not running
EVENTS_PUBLISHER_TAG = "process-not-running"
class ServiceChecker(HealthChecker):
"""
@ -55,6 +57,8 @@ class ServiceChecker(HealthChecker):
self.load_critical_process_cache()
self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
def get_expected_running_containers(self, feature_table):
"""Get a set of containers that are expected to running on SONiC
@ -288,7 +292,7 @@ class ServiceChecker(HealthChecker):
self.reset()
self.check_by_monit(config)
self.check_services(config)
swsscommon.events_deinit_publisher(self.events_handle)
def _parse_supervisorctl_status(self, process_status):
"""Expected input:
@ -309,6 +313,13 @@ class ServiceChecker(HealthChecker):
data[items[0].strip()] = items[1].strip()
return data
def publish_events(self, container_name, critical_process_list):
    """Publish one event per critical process of a container.

    Args:
        container_name: Value stored under the "ctr_name" key of every event.
        critical_process_list: Iterable of process names; each one is stored
            under the "process_name" key and published as its own event.

    NOTE(review): self.events_handle is also passed to
    swsscommon.events_deinit_publisher() after the checks run; confirm the
    handle is still valid if the checks run more than once on one instance.
    """
    event_params = swsscommon.FieldValueMap()
    event_params["ctr_name"] = container_name
    # The same map is reused; only the process name changes between events.
    for critical_process in critical_process_list:
        event_params["process_name"] = critical_process
        swsscommon.event_publish(self.events_handle, EVENTS_PUBLISHER_TAG, event_params)
def check_process_existence(self, container_name, critical_process_list, config, feature_table):
"""Check whether the process in the specified container is running or not.
@ -333,6 +344,7 @@ class ServiceChecker(HealthChecker):
if process_status is None:
for process_name in critical_process_list:
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.publish_events(container_name, critical_process_list)
return
process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())