#!/usr/bin/env python3

"""
memory_checker

This script is part of the feature which will restart the container if its memory
usage is larger than the threshold value.

This script is used to check the memory usage of a specified container and
is intended to be run by Monit. It will write an alerting message into
syslog if the memory usage of the container is larger than the threshold value for X
times within Y cycles/minutes. Note that if a print(...) statement in this script
is executed, its string will be appended to the Monit syslog messages.

The following is an example in the Monit configuration file showing how Monit runs
this script:

check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
    if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
"""

import argparse
import subprocess
import sys
import syslog
import re
import time

import docker

from swsscommon import swsscommon

EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"
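
# Exit codes of this script (Monit's "check program" treats the exit status as
# the program status in its config rules):
#   0 - nothing to alert on (usage below threshold, or docker/container not running)
#   1 - the spawned command returned a non-zero exit code
#   2 - the spawned command could not be executed
#   3 - memory usage exceeds the threshold (matched by "if status == 3" in Monit)
#   4 - the memory usage value could not be parsed
#   5 - querying the docker daemon failed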


def get_command_result(command):
    """Executes the command and returns the resulting output.

    Args:
        command: A list of strings containing the command and its arguments.

    Returns:
        A string which contains the standard output of the command.
    """
    command_stdout = ""

    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
        if proc_instance.returncode != 0:
            syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Return code: '{}'"
                          .format(command, proc_instance.returncode))
            sys.exit(1)
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    return command_stdout.strip()


def publish_events(container_name, mem_usage_bytes, threshold_value):
    """Publishes a structured event about the exceeded memory threshold via swsscommon.

    Args:
        container_name: A string containing the container name.
        mem_usage_bytes: A string containing the measured memory usage in bytes.
        threshold_value: A string containing the threshold value in bytes.

    Returns:
        None.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    params = swsscommon.FieldValueMap()
    params["ctr_name"] = container_name
    params["mem_usage"] = mem_usage_bytes
    params["threshold"] = threshold_value
    swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    swsscommon.events_deinit_publisher(events_handle)
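
# For illustration, a call such as publish_events("telemetry", "524288000.00",
# "419430400") publishes an event tagged "mem-threshold-exceeded" carrying the
# ctr_name, mem_usage and threshold fields (the values here are made up).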


def check_memory_usage(container_name, threshold_value):
    """Checks the memory usage of a container and writes an alerting message into
    the syslog if the memory usage is larger than the threshold value.

    Args:
        container_name: A string representing the name of a container.
        threshold_value: An integer indicating the threshold value (in bytes) of memory usage.

    Returns:
        None.
    """
    command = ["docker", "stats", "--no-stream", "--format", "{{.MemUsage}}", container_name]
    command_stdout = get_command_result(command)
    mem_usage = command_stdout.split("/")[0].strip()
    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    if match_obj:
        mem_usage_value = float(mem_usage[match_obj.start():match_obj.end()])
        mem_usage_unit = mem_usage[match_obj.end():]
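
        # docker reports usage in binary units (B, KiB, MiB, GiB). Any unit not
        # matched below (e.g. "TiB") leaves mem_usage_bytes at 0.0, so no alert
        # would be raised for it.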

        mem_usage_bytes = 0.0
        if mem_usage_unit == "B":
            mem_usage_bytes = mem_usage_value
        elif mem_usage_unit == "KiB":
            mem_usage_bytes = mem_usage_value * 1024
        elif mem_usage_unit == "MiB":
            mem_usage_bytes = mem_usage_value * 1024 ** 2
        elif mem_usage_unit == "GiB":
            mem_usage_bytes = mem_usage_value * 1024 ** 3

        if mem_usage_bytes > threshold_value:
            print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
                  .format(container_name, mem_usage_bytes, threshold_value))
            syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
                          .format(container_name, mem_usage_bytes, threshold_value))
            # publish event
            publish_events(container_name, "{:.2f}".format(mem_usage_bytes), str(threshold_value))
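            # Exit status 3 is what the Monit rule shown in the module docstring
            # matches on ("if status == 3 ..."), which drives the restart action.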
            sys.exit(3)
    else:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)


def is_service_active(service_name):
    """Tests whether a systemd service is running.

    Args:
        service_name: A string containing the service name.

    Returns:
        True if the service is running, False otherwise.
    """
    status = subprocess.run(["systemctl", "is-active", "--quiet", service_name])
    return status.returncode == 0


def get_running_container_names():
    """Retrieves the names of running containers by talking to the docker daemon.

    Args:
        None.

    Returns:
        running_container_names: A list of the names of running containers.
    """
    try:
        docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
        running_container_list = docker_client.containers.list(filters={"status": "running"})
        running_container_names = [container.name for container in running_container_list]
    except (docker.errors.APIError, docker.errors.DockerException) as err:
        if not is_service_active("docker"):
            syslog.syslog(syslog.LOG_INFO,
                          "[memory_checker] Docker service is not running. Error message is: '{}'".format(err))
            return []

        syslog.syslog(syslog.LOG_ERR,
                      "Failed to retrieve the running container list from docker daemon! Error message is: '{}'"
                      .format(err))
        sys.exit(5)

    return running_container_names


def main():
    parser = argparse.ArgumentParser(description="Check the memory usage of a container; an alerting "
                                                 "message will be written into syslog if memory usage "
                                                 "is larger than the threshold value",
                                     usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")
    parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # be removed in the new version since we want to read this value from 'CONFIG_DB'.
    parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
    args = parser.parse_args()

    if not is_service_active("docker"):
        syslog.syslog(syslog.LOG_INFO,
                      "[memory_checker] Exits without checking memory usage of container '{}' since docker daemon is not running!"
                      .format(args.container_name))
        sys.exit(0)
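
    # The early exit above returns 0, so Monit's "status == 3" condition is not
    # met and no restart is triggered while the docker daemon itself is down.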

    running_container_names = get_running_container_names()
    if args.container_name in running_container_names:
        check_memory_usage(args.container_name, args.threshold_value)
    else:
        syslog.syslog(syslog.LOG_INFO,
                      "[memory_checker] Exits without checking memory usage since container '{}' is not running!"
                      .format(args.container_name))


if __name__ == "__main__":
    main()