[monit] Adding patch to enhance syslog error message generation for monit alert action when status is failed. (#5720)

Why/How I did:

Make sure first error syslog is triggered based on FAULT TOLERANCE condition.

Added support for the repeat clause with the alert action. This is used as the trigger
for generating periodic syslog error messages if the error is persistent.

Updated the monit conf files with "repeat every x cycles" for the alert action.
This commit is contained in:
abdosi 2020-10-31 17:29:49 -07:00 committed by GitHub
parent 8d8aadb615
commit dddf96933c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 111 additions and 45 deletions

View File

@ -4,4 +4,4 @@
## redis_server
###############################################################################
check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -9,19 +9,19 @@
## bgpmon
###############################################################################
check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program bgp|bgpmon with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpmon"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -6,10 +6,10 @@
## lldpmgrd
###############################################################################
check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -13,31 +13,31 @@
## vxlanmgrd
##############################################################################
check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## sflowmgrd
###############################################################################
check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -5,7 +5,7 @@
## snmpd_subagent
###############################################################################
check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3 -m sonic_ax_impl"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## restapi
###############################################################################
check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -5,7 +5,7 @@
## dialout_client
###############################################################################
check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -5,7 +5,7 @@
## teammgrd
###############################################################################
check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -6,15 +6,15 @@
###############################################################################
check filesystem root-overlay with path /
if space usage > 90% for 10 times within 20 cycles then alert
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
check filesystem var-log with path /var/log
if space usage > 90% for 10 times within 20 cycles then alert
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
check system $HOST
if memory usage > 90% for 10 times within 20 cycles then alert
if cpu usage (user) > 90% for 10 times within 20 cycles then alert
if cpu usage (system) > 90% for 10 times within 20 cycles then alert
if memory usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
if cpu usage (user) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
if cpu usage (system) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
check process rsyslog with pidfile /var/run/rsyslogd.pid
start program = "/bin/systemctl start rsyslog.service"
@ -29,4 +29,5 @@ check process rsyslog with pidfile /var/run/rsyslogd.pid
#
check program routeCheck with path "/usr/local/bin/route_check.py"
every 5 cycles
if status != 0 then alert
if status != 0 for 3 cycle then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -5,7 +5,7 @@
## dsserve
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -5,7 +5,7 @@
## dsserve
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

View File

@ -0,0 +1,64 @@
From 97a5defc6a7fcc6a00f691bb5314ceb8fb7704e9 Mon Sep 17 00:00:00 2001
From: Abhishek Dosi <abdosi@microsoft.com>
Date: Mon, 26 Oct 2020 11:40:02 -0700
Subject: [PATCH] Patch on top of commit. Patch is addressing these changes:
a) Enable the repeat keyword for the alert action. Using this we can log
a syslog error message for a persistent failure condition.
b) Make sure the error message is logged if the state changes to failed for the first time (fault tolerance condition)
or if we have a repeat clause for the alert.
Signed-off-by: Abhishek Dosi <abdosi@microsoft.com>
---
src/event.c | 6 +++++-
src/p.y | 8 +++++++-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/event.c b/src/event.c
index ed363ee..9d08fc0 100644
--- a/src/event.c
+++ b/src/event.c
@@ -336,7 +336,8 @@ static void _handleEvent(Service_T S, Event_T E) {
if (E->state != State_Init || E->state_map & 0x1) {
if (E->state == State_Succeeded || E->state == State_ChangedNot || E->id == Event_Instance || E->id == Event_Action)
LogInfo("'%s' %s\n", S->name, E->message);
- else
+ /* Send Error log if state change to failed for 1st time or if we have repeat clause then do periodically */
+ else if ((E->state_changed) || (E->state == State_Failed && E->action->failed->repeat && E->count % E->action->failed->repeat == 0))
LogError("'%s' %s\n", S->name, E->message);
}
if (E->state == State_Init)
return;
diff --git a/src/p.y b/src/p.y
index a57807d..b46b1a1 100644
--- a/src/p.y
+++ b/src/p.y
@@ -2250,9 +2250,12 @@ repeat : /* EMPTY */ {
}
;
-action : ALERT {
+action : ALERT repeat{
$<number>$ = Action_Alert;
}
+ | ALERT {
+ $<number>$ = Action_Alert;
+ }
| EXEC argumentlist repeat {
$<number>$ = Action_Exec;
}
@@ -2281,6 +2284,9 @@ action1 : action {
repeat = 0;
command1 = command;
command = NULL;
+ } else if ($<number>1 == Action_Alert) {
+ repeat1 = repeat;
+ repeat = 0;
}
}
;
--
2.17.1

View File

@ -1,2 +1,3 @@
# This series applies on GIT commit dc9bc1c949125140d967edfc598dfad47eedc552
0001-used_system_memory_sysdep-Use-MemAvailable-value-if-.patch
0002-change_monit_alert_log_error.patch