[auto-ts] add memory check (#10433) (#12291)

#### Why I did it

To support automatic techsupport invocation when memory usage is too high.

#### How I did it

Implemented according to https://github.com/Azure/SONiC/pull/939

#### How to verify it

Unit tests (UT) and manual testing on the switch.

*DEPENDS* on https://github.com/Azure/sonic-utilities/pull/2116
This commit is contained in:
Stepan Blyshchak 2022-10-06 18:06:46 +03:00 committed by GitHub
parent 2b36f81063
commit 06f8b1f98a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 103 additions and 10 deletions

View File

@ -84,6 +84,8 @@
"rate_limit_interval" : "180",
"max_techsupport_limit" : "10.0",
"max_core_limit" : "5.0",
"available_mem_threshold": "10.0",
"min_available_mem": "200",
"since" : "2 days ago"
}
},
@ -93,7 +95,8 @@
{%- if enable_auto_tech_support == "y" %}
"state" : "enabled", {% else %}
"state" : "disabled", {% endif %}
"rate_limit_interval" : "600"
"rate_limit_interval" : "600",
"available_mem_threshold": "10.0"
}{%if not loop.last %},{% endif -%}
{% endfor %}
},

View File

@ -46,3 +46,6 @@ check program vnetRouteCheck with path "/usr/local/bin/vnet_route_check.py"
every 5 cycles
if status != 0 for 3 cycle then alert repeat every 1 cycles
# memory_check tool that verifies that memory usage does not cross the threshold; if it does, techsupport is invoked.
check program memory_check with path "/usr/local/bin/memory_threshold_check.py"
if status == 2 for 10 times within 20 cycles then exec "/usr/local/bin/memory_threshold_check_handler.py"

View File

@ -8,7 +8,7 @@
},
"AUTO_TECHSUPPORT_INVALID_RATE_LIMIT_FORMAT": {
"desc" : "Configure cooloff with a value of invalid format",
"eStrKey": "InvalidValue"
"eStrKey": "InvalidValue"
},
"AUTO_TECHSUPPORT_OUT_OF_RANGE_DECIMAL": {
"desc" : "Configure a value for core-uage outside the range [0, 100)",
@ -19,9 +19,23 @@
},
"AUTO_TECHSUPPORT_INVALID_FRACTION_DIGITS": {
"desc" : "Configure a value for max_techsupport_size inside the range [0, 100) but with 3 fractional digits",
"eStrKey": "InvalidValue"
"eStrKey": "InvalidValue"
},
"AUTO_TECHSUPPORT_RATE_LIMIT_INTERVAL_TEST": {
"desc" : "Configure and test the valid configuration"
},
"AUTO_TECHSUPPORT_AVAILABLE_MEM_THRESHOLD": {
"desc" : "Configure and test the valid configuration"
},
"AUTO_TECHSUPPORT_INVALID_AVAILABLE_MEM_THRESHOLD": {
"desc" : "Configure a value for available_mem_threshold inside the range [0, 100) but with 3 fractional digits",
"eStrKey": "InvalidValue"
},
"AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_VALID": {
"desc" : "Configure and test the valid configuration"
},
"AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_INVALID_THRESHOLD": {
"desc" : "Configure a value for available_mem_threshold inside the range [0, 100) but with 3 fractional digits",
"eStrKey": "InvalidValue"
}
}

View File

@ -8,7 +8,7 @@
"max_techsupport_limit" : "10.0",
"max_core_limit" : "5.0",
"since" : "2 days ago"
}
}
}
}
},
@ -20,8 +20,8 @@
"rate_limit_interval" : "180",
"max_techsupport_limit" : "10.0",
"max_core_limit" : "5.0",
"since" : "2 days ago"
}
"since" : "2 days ago"
}
}
}
},
@ -30,7 +30,7 @@
"sonic-auto_techsupport:AUTO_TECHSUPPORT": {
"sonic-auto_techsupport:GLOBAL": {
"rate_limit_interval" : "whatever"
}
}
}
}
},
@ -40,7 +40,7 @@
"sonic-auto_techsupport:GLOBAL": {
"max_core_limit" : "100.00",
"rate_limit_interval" : "180"
}
}
}
}
},
@ -50,7 +50,7 @@
"sonic-auto_techsupport:GLOBAL": {
"max_techsupport_limit" : "11.23",
"max_core_limit" : "99.99"
}
}
}
}
},
@ -60,7 +60,7 @@
"sonic-auto_techsupport:GLOBAL": {
"max_techsupport_limit" : "11.111",
"max_core_limit" : "99.99"
}
}
}
}
},
@ -81,5 +81,60 @@
]
}
}
},
"AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_VALID": {
"sonic-auto_techsupport:sonic-auto_techsupport": {
"sonic-auto_techsupport:AUTO_TECHSUPPORT": {
"sonic-auto_techsupport:GLOBAL": {
"available_mem_threshold": "10.0",
"min_available_mem": "900"
}
}
}
},
"AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_INVALID_THRESHOLD": {
"sonic-auto_techsupport:sonic-auto_techsupport": {
"sonic-auto_techsupport:AUTO_TECHSUPPORT": {
"sonic-auto_techsupport:GLOBAL": {
"available_mem_threshold": "11.111"
}
}
}
},
"AUTO_TECHSUPPORT_AVAILABLE_MEM_THRESHOLD": {
"sonic-auto_techsupport:sonic-auto_techsupport": {
"sonic-auto_techsupport:AUTO_TECHSUPPORT_FEATURE": {
"AUTO_TECHSUPPORT_FEATURE_LIST": [
{
"feature_name" : "bgp",
"state" : "enabled",
"available_mem_threshold": "10.0"
},
{
"feature_name" : "swss",
"state" : "disabled",
"available_mem_threshold": "10.0"
}
]
}
}
},
"AUTO_TECHSUPPORT_INVALID_AVAILABLE_MEM_THRESHOLD": {
"sonic-auto_techsupport:sonic-auto_techsupport": {
"sonic-auto_techsupport:AUTO_TECHSUPPORT_FEATURE": {
"AUTO_TECHSUPPORT_FEATURE_LIST": [
{
"feature_name" : "bgp",
"state" : "enabled",
"available_mem_threshold": "11.111"
},
{
"feature_name" : "swss",
"state" : "disabled",
"available_mem_threshold": "10.0"
}
]
}
}
}
}

View File

@ -59,6 +59,18 @@ module sonic-auto_techsupport {
description "Max Limit in percentage for the cummulative size of core dumps. No cleanup is performed if the value isn't congiured or is 0.0";
type decimal-repr;
}
leaf available_mem_threshold {
description "Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing";
type decimal-repr;
default 10.0;
}
leaf min_available_mem {
description "Minimum Free memory (in MB) that should be available for the techsupport execution to start";
type uint32;
default 200;
}
leaf since {
/*
@ -96,6 +108,12 @@ module sonic-auto_techsupport {
type stypes:admin_mode;
}
leaf available_mem_threshold {
description "Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing";
type decimal-repr;
default 10.0;
}
leaf rate_limit_interval {
description "Rate limit interval for the corresponding feature. Configure 0 to explicitly disable";
type uint16;