43340cd58d
#### Why I did it
To fix the logic introduced by [[memory_checker] Do not check memory usage of containers which are not created #11129](https://github.com/sonic-net/sonic-buildimage/pull/11129).
The following scenario can occur shortly before a reboot:
1. The `docker` service has stopped.
2. Within a very short period of time afterwards, the monit service runs the check reported by `root@sonic:/home/admin# monit status container_memory_telemetry`.
In this scenario, the `memory_checker` script writes an error to the syslog:
```
ERR memory_checker: Failed to retrieve the running container list from docker daemon! Error message is: 'Error while fetching server API version: ('Connection aborted.', FileNotFoundError(2, 'No such file or directory'))'
```
However, this is actually correct behavior: when the docker service is stopped, its Unix socket is destroyed, which is why the `FileNotFoundError(2, 'No such file or directory')` exception shows up in the syslog.
#### How I did it
Changed the log severity to warning and changed the return value.
#### How to verify it
It is really hard to catch the exact moment described in the `Why I did it` section.
In order to check the logic:
1. Change the Unix socket path to a non-existing one in the `/usr/bin/memory_checker` file on the switch (see line 139 of `files/image_config/monit/memory_checker` at revision `47742dfc2c`).
2. Execute the `root@sonic:/home/admin# monit restart container_memory_telemetry`
3. Check the syslog for such messages:
```
WARNING memory_checker: Failed to retrieve the running container list from docker daemon! Error message is: 'Error while fetching server API version: ('Connection aborted.', FileNotFoundError(2, 'No such file or directory'))'
INFO memory_checker: [memory_checker] Exits without checking memory usage since container 'telemetry' is not running!
```
183 lines
7.1 KiB
Python
Executable File
183 lines
7.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
|
|
memory_checker
|
|
|
|
This script is part of the feature which will restart the container if memory
|
|
usage of it is larger than the threshold value.
|
|
|
|
This script is used to check the memory usage of specified container and
|
|
is intended to be run by Monit. It will write an alerting message into
|
|
syslog if memory usage of the container is larger than the threshold value for X
|
|
times within Y cycles/minutes. Note that if print(...) statement in this script
|
|
was executed, the string in it will be appended to Monit syslog messages.
|
|
|
|
The following is an example in Monit configuration file to show how Monit will run
|
|
this script:
|
|
|
|
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
|
|
if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
|
|
"""
|
|
|
|
import argparse
|
|
import subprocess
|
|
import sys
|
|
import syslog
|
|
import re
|
|
import time
|
|
|
|
import docker
|
|
|
|
from swsscommon import swsscommon
|
|
|
|
# Publisher source name and event tag that publish_events() passes to the
# swsscommon events API when a container exceeds its memory threshold.
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"
|
|
|
|
def get_command_result(command):
    """Run an external command and hand back what it wrote to stdout.

    Args:
        command: The command to run, in a form accepted by subprocess
            (the caller in this file passes an argument list).

    Returns:
        The command's standard output with leading/trailing whitespace removed.

    Exits:
        1 if the command finished with a non-zero return code.
        2 if the command could not be run (OSError or ValueError).
    """
    try:
        completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                   universal_newlines=True)
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    if completed.returncode != 0:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Return code: '{}'"
                      .format(command, completed.returncode))
        sys.exit(1)

    return completed.stdout.strip()
|
|
|
|
|
|
def publish_events(container_name, mem_usage_bytes, threshold_value):
    """Publish a "mem-threshold-exceeded" event for a container.

    Args:
        container_name: Name of the container; sent as the "ctr_name" field.
        mem_usage_bytes: Memory usage of the container; sent as the "mem_usage"
            field (the caller in this file passes an already formatted string).
        threshold_value: Memory threshold; sent as the "threshold" field
            (the caller in this file passes a string).

    Returns:
        None.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        params["ctr_name"] = container_name
        params["mem_usage"] = mem_usage_bytes
        params["threshold"] = threshold_value
        swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if building or publishing
        # the event raised.
        swsscommon.events_deinit_publisher(events_handle)
|
|
|
|
|
|
# Binary unit suffixes in ascending order; the suffix at index i is worth
# 1024 ** i bytes.
_MEM_USAGE_UNITS = ("B", "KiB", "MiB", "GiB", "TiB", "PiB")


def _mem_usage_to_bytes(mem_usage):
    """Convert a memory figure such as "45.2MiB" into a number of bytes.

    Args:
        mem_usage: A string made of a decimal number followed by a unit suffix.

    Returns:
        The amount in bytes as a float, or None if the string does not start
        with a number or carries a unit suffix that is not recognised.
    """
    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    if match_obj is None:
        return None

    unit = mem_usage[match_obj.end():].strip()
    if unit not in _MEM_USAGE_UNITS:
        # Refuse to guess: treating an unknown unit as 0 bytes would silently
        # disable the threshold check.
        return None

    return float(match_obj.group()) * 1024 ** _MEM_USAGE_UNITS.index(unit)


def check_memory_usage(container_name, threshold_value):
    """Checks the memory usage of a container and writes an alerting message into
    the syslog if the memory usage is larger than the threshold value.

    The usage is read from `docker stats --no-stream --format {{.MemUsage}}`;
    only the part before the "/" separator is used.

    Args:
        container_name: A string represents name of a container
        threshold_value: An integer indicates the threshold value (Bytes) of memory usage.

    Returns:
        None when the memory usage does not exceed the threshold.

    Exits:
        3 if the memory usage is larger than the threshold (after printing the
          alert, logging it and publishing an event).
        4 if the memory usage could not be parsed from the command output.
    """
    command = ["docker", "stats", "--no-stream", "--format", "{{.MemUsage}}", container_name]
    command_stdout = get_command_result(command)
    mem_usage = command_stdout.split("/")[0].strip()

    mem_usage_bytes = _mem_usage_to_bytes(mem_usage)
    if mem_usage_bytes is None:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    if mem_usage_bytes > threshold_value:
        # Text printed here is appended by Monit to its own syslog message.
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
                      .format(container_name, mem_usage_bytes, threshold_value))
        # publish event
        publish_events(container_name, "{:.2f}".format(mem_usage_bytes), str(threshold_value))
        sys.exit(3)
|
|
|
|
|
|
def is_service_active(service_name):
    """Tell whether a systemd service is currently active.

    Args:
        service_name: A string contains the service name

    Returns:
        True if `systemctl is-active --quiet <service_name>` returns 0,
        False otherwise
    """
    probe_command = ["systemctl", "is-active", "--quiet", service_name]
    return subprocess.run(probe_command).returncode == 0
|
|
|
|
|
|
def get_running_container_names():
    """Ask the docker daemon for the names of all running containers.

    Args:
        None.

    Returns:
        A list with the names of the running containers. An empty list is
        returned when the query fails and the docker service is not active.

    Exits:
        5 if the query fails although the docker service is active.
    """
    try:
        client = docker.DockerClient(base_url='unix://var/run/docker.sock')
        running_containers = client.containers.list(filters={"status": "running"})
        return [entry.name for entry in running_containers]
    except (docker.errors.APIError, docker.errors.DockerException) as err:
        if is_service_active("docker"):
            # The daemon should be reachable, so this is a genuine failure.
            syslog.syslog(syslog.LOG_ERR,
                          "Failed to retrieve the running container list from docker daemon! Error message is: '{}'"
                          .format(err))
            sys.exit(5)

        # The docker service is down, so failing to reach it is not an error.
        syslog.syslog(syslog.LOG_INFO,
                      "[memory_checker] Docker service is not running. Error message is: '{}'".format(err))
        return []
|
|
|
|
|
|
def main():
    """Parse the command line and check the memory usage of one container.

    Expects two positional arguments: the container name and the threshold
    value in bytes. Exits with 0 without checking anything when the docker
    service is not active; only logs a message when the container is not
    among the running ones.
    """
    parser = argparse.ArgumentParser(
        description="Check memory usage of a container "
                    "and an alerting message will be written into syslog if memory usage "
                    "is larger than the threshold value",
        usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")
    parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
    args = parser.parse_args()
    target_container = args.container_name

    if not is_service_active("docker"):
        syslog.syslog(syslog.LOG_INFO,
                      "[memory_checker] Exits without checking memory usage of container '{}' since docker daemon is not running!"
                      .format(target_container))
        sys.exit(0)

    if target_container not in get_running_container_names():
        syslog.syslog(syslog.LOG_INFO,
                      "[memory_checker] Exits without checking memory usage since container '{}' is not running!"
                      .format(target_container))
        return

    check_memory_usage(target_container, args.threshold_value)
|
|
|
|
|
|
# Run the check only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|