From dddf96933c1dccf2064790a452de87b0ac632abc Mon Sep 17 00:00:00 2001 From: abdosi <58047199+abdosi@users.noreply.github.com> Date: Sat, 31 Oct 2020 17:29:49 -0700 Subject: [PATCH] [monit] Adding patch to enhance syslog error message generation for monit alert action when status is failed. (#5720) Why/How I did: Make sure first error syslog is triggered based on FAULT TOLERANCE condition. Added support of repeat clause with alert action. This is used as trigger for generation of periodic syslog error messages if error is persistent Updated the monit conf files with repeat every x cycles for the alert action --- .../base_image_files/monit_database | 2 +- .../docker-fpm-frr/base_image_files/monit_bgp | 12 ++-- .../docker-lldp/base_image_files/monit_lldp | 6 +- .../base_image_files/monit_swss | 20 +++--- .../docker-sflow/base_image_files/monit_sflow | 2 +- .../docker-snmp/base_image_files/monit_snmp | 4 +- .../base_image_files/monit_restapi | 2 +- .../base_image_files/monit_telemetry | 4 +- .../docker-teamd/base_image_files/monit_teamd | 4 +- files/image_config/monit/conf.d/sonic-host | 13 ++-- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 4 +- .../0002-change_monit_alert_log_error.patch | 64 +++++++++++++++++++ src/monit/patch/series | 1 + 21 files changed, 111 insertions(+), 45 deletions(-) create mode 100644 src/monit/patch/0002-change_monit_alert_log_error.patch diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index c1addd8a6f..47c9d1b2d4 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -4,4 +4,4 @@ ## redis_server ############################################################################### check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index 4567d45e3c..3361b9e64f 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -9,19 +9,19 @@ ## bgpmon ############################################################################### check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program bgp|bgpmon with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpmon" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index 194fa14a30..8dc2f3c153 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ b/dockers/docker-lldp/base_image_files/monit_lldp @@ -6,10 +6,10 @@ ## lldpmgrd ############################################################################### check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index f5f4389f3f..da601011e7 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -13,31 +13,31 @@ ## vxlanmgrd ############################################################################## check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index 217f2e6258..84b36b18ce 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ b/dockers/docker-sflow/base_image_files/monit_sflow @@ -4,4 +4,4 @@ ## sflowmgrd ############################################################################### check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index b1725378c0..6a368a9b60 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -5,7 +5,7 @@ ## snmpd_subagent ############################################################################### check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3 -m sonic_ax_impl" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 84e4366f4a..6752100b84 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -4,4 +4,4 @@ ## restapi ############################################################################### check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index 7365ce51d1..3680bbe6cf 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -5,7 +5,7 @@ ## dialout_client ############################################################################### check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/dockers/docker-teamd/base_image_files/monit_teamd b/dockers/docker-teamd/base_image_files/monit_teamd index 256482aef2..626a614560 100644 --- a/dockers/docker-teamd/base_image_files/monit_teamd +++ b/dockers/docker-teamd/base_image_files/monit_teamd @@ -5,7 +5,7 @@ ## teammgrd ############################################################################### check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/files/image_config/monit/conf.d/sonic-host b/files/image_config/monit/conf.d/sonic-host index 3fd313e24b..202c49f8d7 100644 --- a/files/image_config/monit/conf.d/sonic-host +++ b/files/image_config/monit/conf.d/sonic-host @@ -6,15 +6,15 @@ ############################################################################### check filesystem root-overlay with path / - if space usage > 90% for 10 times within 20 cycles then alert + if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles check filesystem var-log with path /var/log - if space usage > 90% for 10 times within 20 cycles then alert + if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles check system $HOST - if memory usage > 90% for 10 times within 20 cycles then alert - if cpu usage (user) > 90% for 10 times within 20 cycles then alert - if cpu usage (system) > 90% for 10 times within 20 cycles then alert + if memory usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles + if cpu usage (user) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles + if cpu usage (system) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles check process rsyslog with pidfile /var/run/rsyslogd.pid start program = "/bin/systemctl start rsyslog.service" @@ -29,4 +29,5 @@ check process rsyslog with pidfile /var/run/rsyslogd.pid # check program routeCheck with path "/usr/local/bin/route_check.py" every 5 cycles - if status != 0 then alert + if status != 0 for 3 cycle then alert repeat every 1 cycles + diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index 1195487700..d63346d9ee 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -5,7 +5,7 @@ ## dsserve ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index 14789c67c3..61e290e318 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -4,4 +4,4 @@ ## syncd ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index 1195487700..d63346d9ee 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -5,7 +5,7 @@ ## dsserve ############################################################################### check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" - if status != 0 for 5 times within 5 cycles then alert + if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles diff --git a/src/monit/patch/0002-change_monit_alert_log_error.patch b/src/monit/patch/0002-change_monit_alert_log_error.patch new file mode 100644 index 0000000000..1e43078e62 --- /dev/null +++ b/src/monit/patch/0002-change_monit_alert_log_error.patch @@ -0,0 +1,64 @@ +From 97a5defc6a7fcc6a00f691bb5314ceb8fb7704e9 Mon Sep 17 00:00:00 2001 +From: Abhishek Dosi +Date: Mon, 26 Oct 2020 11:40:02 -0700 +Subject: [PATCH] Patch on top of commit Patch is addressing these changes:- + +a) Enable repeat keyword for alert action . Using this we can log +syslog error message for persistent failure condition + +b) Make sure error message is loggged if state is changed to fail first time (fault tolerance condition) +or we have repeat clause for alert + +Signed-off-by: Abhishek Dosi + +--- + src/event.c | 6 +++++- + src/p.y | 8 +++++++- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/src/event.c b/src/event.c +index ed363ee..9d08fc0 100644 +--- a/src/event.c ++++ b/src/event.c +@@ -336,7 +336,8 @@ static void _handleEvent(Service_T S, Event_T E) { + if (E->state != State_Init || E->state_map & 0x1) { + if (E->state == State_Succeeded || E->state == State_ChangedNot || E->id == Event_Instance || E->id == Event_Action) + LogInfo("'%s' %s\n", S->name, E->message); +- else ++ /* Send Error log if state change to failed for 1st time or if we have repeat clause then do periodically */ ++ else if ((E->state_changed) || (E->state == State_Failed && E->action->failed->repeat && E->count % E->action->failed->repeat == 0)) + LogError("'%s' %s\n", S->name, E->message); + } + if (E->state == State_Init) + return; +diff --git a/src/p.y b/src/p.y +index a57807d..b46b1a1 100644 +--- a/src/p.y ++++ b/src/p.y +@@ -2250,9 +2250,12 @@ repeat : /* EMPTY */ { + } + ; + +-action : ALERT { ++action : ALERT repeat{ + $$ = Action_Alert; + } ++ | ALERT { ++ $$ = Action_Alert; ++ } + | EXEC argumentlist repeat { + $$ = Action_Exec; + } +@@ -2281,6 +2284,9 @@ action1 : action { + repeat = 0; + command1 = command; + command = NULL; ++ } else if ($1 == Action_Alert) { ++ repeat1 = repeat; ++ repeat = 0; + } + } + ; +-- +2.17.1 + diff --git a/src/monit/patch/series b/src/monit/patch/series index 15fcdd50c8..f5534d0f55 100644 --- a/src/monit/patch/series +++ b/src/monit/patch/series @@ -1,2 +1,3 @@ # This series applies on GIT commit dc9bc1c949125140d967edfc598dfad47eedc552 0001-used_system_memory_sysdep-Use-MemAvailable-value-if-.patch +0002-change_monit_alert_log_error.patch