diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index c1addd8a6f..47c9d1b2d4 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -4,4 +4,4 @@ ## redis_server ############################################################################### check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index 4567d45e3c..3361b9e64f 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -9,19 +9,19 @@ ## bgpmon ############################################################################### check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd" - if status != 0 
for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|bgpmon with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpmon" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index 194fa14a30..8dc2f3c153 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ b/dockers/docker-lldp/base_image_files/monit_lldp @@ -6,10 +6,10 @@ ## lldpmgrd ############################################################################### check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index f5f4389f3f..da601011e7 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -13,31 +13,31 @@ ## vxlanmgrd ############################################################################## check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert 
repeat every 1 cycles check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles 
then alert repeat every 1 cycles diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index 217f2e6258..84b36b18ce 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ b/dockers/docker-sflow/base_image_files/monit_sflow @@ -4,4 +4,4 @@ ## sflowmgrd ############################################################################### check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index b1725378c0..6a368a9b60 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -5,7 +5,7 @@ ## snmpd_subagent ############################################################################### check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3 -m sonic_ax_impl" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 84e4366f4a..6752100b84 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -4,4 +4,4 @@ ## restapi ############################################################################### check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server" - if status != 0 for 5 times within 
5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index 7365ce51d1..3680bbe6cf 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -5,7 +5,7 @@ ## dialout_client ############################################################################### check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-teamd/base_image_files/monit_teamd b/dockers/docker-teamd/base_image_files/monit_teamd index 256482aef2..626a614560 100644 --- a/dockers/docker-teamd/base_image_files/monit_teamd +++ b/dockers/docker-teamd/base_image_files/monit_teamd @@ -5,7 +5,7 @@ ## teammgrd ############################################################################### check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/files/image_config/monit/conf.d/sonic-host b/files/image_config/monit/conf.d/sonic-host index 3fd313e24b..202c49f8d7 100644 --- 
a/files/image_config/monit/conf.d/sonic-host +++ b/files/image_config/monit/conf.d/sonic-host @@ -6,15 +6,15 @@ ############################################################################### check filesystem root-overlay with path / - if space usage > 90% for 10 times within 20 cycles then alert + if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles check filesystem var-log with path /var/log - if space usage > 90% for 10 times within 20 cycles then alert + if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles check system $HOST - if memory usage > 90% for 10 times within 20 cycles then alert - if cpu usage (user) > 90% for 10 times within 20 cycles then alert - if cpu usage (system) > 90% for 10 times within 20 cycles then alert + if memory usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles + if cpu usage (user) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles + if cpu usage (system) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles check process rsyslog with pidfile /var/run/rsyslogd.pid start program = "/bin/systemctl start rsyslog.service" @@ -29,4 +29,5 @@ check process rsyslog with pidfile /var/run/rsyslogd.pid # check program routeCheck with path "/usr/local/bin/route_check.py" every 5 cycles - if status != 0 then alert + if status != 0 for 3 cycle then alert repeat every 1 cycles + diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert 
+ if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index 1195487700..d63346d9ee 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -5,7 +5,7 @@ ## dsserve ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd 
############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd 
############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index 1195487700..d63346d9ee 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -5,7 +5,7 @@ ## dsserve ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/src/monit/patch/0002-change_monit_alert_log_error.patch b/src/monit/patch/0002-change_monit_alert_log_error.patch new file mode 100644 index 
0000000000..1e43078e62 --- /dev/null +++ b/src/monit/patch/0002-change_monit_alert_log_error.patch @@ -0,0 +1,64 @@ +From 97a5defc6a7fcc6a00f691bb5314ceb8fb7704e9 Mon Sep 17 00:00:00 2001 +From: Abhishek Dosi +Date: Mon, 26 Oct 2020 11:40:02 -0700 +Subject: [PATCH] Patch on top of commit Patch is addressing these changes:- + +a) Enable repeat keyword for alert action. Using this we can log +syslog error message for persistent failure condition + +b) Make sure error message is logged if state is changed to failed for the first time (fault tolerance condition) +or we have repeat clause for alert + +Signed-off-by: Abhishek Dosi + +--- + src/event.c | 6 +++++- + src/p.y | 8 +++++++- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/src/event.c b/src/event.c +index ed363ee..9d08fc0 100644 +--- a/src/event.c ++++ b/src/event.c +@@ -336,7 +336,8 @@ static void _handleEvent(Service_T S, Event_T E) { + if (E->state != State_Init || E->state_map & 0x1) { + if (E->state == State_Succeeded || E->state == State_ChangedNot || E->id == Event_Instance || E->id == Event_Action) + LogInfo("'%s' %s\n", S->name, E->message); +- else ++ /* Send Error log if state change to failed for 1st time or if we have repeat clause then do periodically */ ++ else if ((E->state_changed) || (E->state == State_Failed && E->action->failed->repeat && E->count % E->action->failed->repeat == 0)) + LogError("'%s' %s\n", S->name, E->message); + } + if (E->state == State_Init) + return; +diff --git a/src/p.y b/src/p.y +index a57807d..b46b1a1 100644 +--- a/src/p.y ++++ b/src/p.y +@@ -2250,9 +2250,12 @@ repeat : /* EMPTY */ { + } + ; + +-action : ALERT { ++action : ALERT repeat{ + $$ = Action_Alert; + } ++ | ALERT { ++ $$ = Action_Alert; ++ } + | EXEC argumentlist repeat { + $$ = Action_Exec; + } +@@ -2281,6 +2284,9 @@ action1 : action { + repeat = 0; + command1 = command; + command = NULL; ++ } else if ($1 == Action_Alert) { ++ repeat1 = repeat; ++ repeat = 0; + } + } + ; +-- +2.17.1 + diff --git
a/src/monit/patch/series b/src/monit/patch/series index 15fcdd50c8..f5534d0f55 100644 --- a/src/monit/patch/series +++ b/src/monit/patch/series @@ -1,2 +1,3 @@ # This series applies on GIT commit dc9bc1c949125140d967edfc598dfad47eedc552 0001-used_system_memory_sysdep-Use-MemAvailable-value-if-.patch +0002-change_monit_alert_log_error.patch