[monit] Adding patch to enhance syslog error message generation for monit alert action when status is failed. (#5720)
Why/How I did: Make sure the first error syslog is triggered based on the FAULT TOLERANCE condition. Added support for the repeat clause with the alert action. This is used as the trigger for generating periodic syslog error messages if the error is persistent. Updated the monit conf files with repeat every x cycles for the alert action.
This commit is contained in:
parent
28366cd0ce
commit
0fad6bdc7f
@ -4,4 +4,4 @@
|
||||
## redis_server
|
||||
###############################################################################
|
||||
check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -9,19 +9,19 @@
|
||||
## bgpmon
|
||||
###############################################################################
|
||||
check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program bgp|bgpmon with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpmon"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -6,10 +6,10 @@
|
||||
## lldpmgrd
|
||||
###############################################################################
|
||||
check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -13,31 +13,31 @@
|
||||
## vxlanmgrd
|
||||
##############################################################################
|
||||
check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## sflowmgrd
|
||||
###############################################################################
|
||||
check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -5,7 +5,7 @@
|
||||
## snmpd_subagent
|
||||
###############################################################################
|
||||
check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3.6 -m sonic_ax_impl"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## restapi
|
||||
###############################################################################
|
||||
check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -5,7 +5,7 @@
|
||||
## dialout_client
|
||||
###############################################################################
|
||||
check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -5,7 +5,7 @@
|
||||
## teammgrd
|
||||
###############################################################################
|
||||
check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -6,15 +6,15 @@
|
||||
###############################################################################
|
||||
|
||||
check filesystem root-overlay with path /
|
||||
if space usage > 90% for 10 times within 20 cycles then alert
|
||||
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
|
||||
|
||||
check filesystem var-log with path /var/log
|
||||
if space usage > 90% for 10 times within 20 cycles then alert
|
||||
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
|
||||
|
||||
check system $HOST
|
||||
if memory usage > 90% for 10 times within 20 cycles then alert
|
||||
if cpu usage (user) > 90% for 10 times within 20 cycles then alert
|
||||
if cpu usage (system) > 90% for 10 times within 20 cycles then alert
|
||||
if memory usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
|
||||
if cpu usage (user) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
|
||||
if cpu usage (system) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
|
||||
|
||||
check process rsyslog with pidfile /var/run/rsyslogd.pid
|
||||
start program = "/bin/systemctl start rsyslog.service"
|
||||
@ -29,4 +29,5 @@ check process rsyslog with pidfile /var/run/rsyslogd.pid
|
||||
#
|
||||
check program routeCheck with path "/usr/bin/route_check.py"
|
||||
every 5 cycles
|
||||
if status != 0 then alert
|
||||
if status != 0 for 3 cycle then alert repeat every 1 cycles
|
||||
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -5,7 +5,7 @@
|
||||
## dsserve
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -4,4 +4,4 @@
|
||||
## syncd
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
@ -5,7 +5,7 @@
|
||||
## dsserve
|
||||
###############################################################################
|
||||
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
|
||||
if status != 0 for 5 times within 5 cycles then alert
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
64
src/monit/patch/0002-change_monit_alert_log_error.patch
Normal file
64
src/monit/patch/0002-change_monit_alert_log_error.patch
Normal file
@ -0,0 +1,64 @@
|
||||
From 97a5defc6a7fcc6a00f691bb5314ceb8fb7704e9 Mon Sep 17 00:00:00 2001
|
||||
From: Abhishek Dosi <abdosi@microsoft.com>
|
||||
Date: Mon, 26 Oct 2020 11:40:02 -0700
|
||||
Subject: [PATCH] Patch on top of commit Patch is addressing these changes:-
|
||||
|
||||
a) Enable repeat keyword for alert action. Using this we can log
|
||||
syslog error message for persistent failure condition
|
||||
|
||||
b) Make sure error message is logged if state is changed to fail first time (fault tolerance condition)
|
||||
or we have repeat clause for alert
|
||||
|
||||
Signed-off-by: Abhishek Dosi <abdosi@microsoft.com>
|
||||
|
||||
---
|
||||
src/event.c | 6 +++++-
|
||||
src/p.y | 8 +++++++-
|
||||
2 files changed, 12 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/event.c b/src/event.c
|
||||
index ed363ee..9d08fc0 100644
|
||||
--- a/src/event.c
|
||||
+++ b/src/event.c
|
||||
@@ -336,7 +336,8 @@ static void _handleEvent(Service_T S, Event_T E) {
|
||||
if (E->state != State_Init || E->state_map & 0x1) {
|
||||
if (E->state == State_Succeeded || E->state == State_ChangedNot || E->id == Event_Instance || E->id == Event_Action)
|
||||
LogInfo("'%s' %s\n", S->name, E->message);
|
||||
- else
|
||||
+ /* Send Error log if state change to failed for 1st time or if we have repeat clause then do periodically */
|
||||
+ else if ((E->state_changed) || (E->state == State_Failed && E->action->failed->repeat && E->count % E->action->failed->repeat == 0))
|
||||
LogError("'%s' %s\n", S->name, E->message);
|
||||
}
|
||||
if (E->state == State_Init)
|
||||
return;
|
||||
diff --git a/src/p.y b/src/p.y
|
||||
index a57807d..b46b1a1 100644
|
||||
--- a/src/p.y
|
||||
+++ b/src/p.y
|
||||
@@ -2250,9 +2250,12 @@ repeat : /* EMPTY */ {
|
||||
}
|
||||
;
|
||||
|
||||
-action : ALERT {
|
||||
+action : ALERT repeat{
|
||||
$<number>$ = Action_Alert;
|
||||
}
|
||||
+ | ALERT {
|
||||
+ $<number>$ = Action_Alert;
|
||||
+ }
|
||||
| EXEC argumentlist repeat {
|
||||
$<number>$ = Action_Exec;
|
||||
}
|
||||
@@ -2281,6 +2284,9 @@ action1 : action {
|
||||
repeat = 0;
|
||||
command1 = command;
|
||||
command = NULL;
|
||||
+ } else if ($<number>1 == Action_Alert) {
|
||||
+ repeat1 = repeat;
|
||||
+ repeat = 0;
|
||||
}
|
||||
}
|
||||
;
|
||||
--
|
||||
2.17.1
|
||||
|
@ -1,2 +1,3 @@
|
||||
# This series applies on GIT commit dc9bc1c949125140d967edfc598dfad47eedc552
|
||||
0001-used_system_memory_sysdep-Use-MemAvailable-value-if-.patch
|
||||
0002-change_monit_alert_log_error.patch
|
||||
|
Loading…
Reference in New Issue
Block a user