sonic-buildimage/files/image_config/monit/memory_checker
Lior Avramov ff3ad9ddd1 [memory_checker] Do not check memory usage of containers if docker daemon is not running (#11476)
Fix in Monit memory_checker plugin. Skip fetching running containers if docker engine is down (can happen in deinit).
This PR fixes issue #11472.

Signed-off-by: liora liora@nvidia.com

Why I did it
In the case where Monit runs during deinit flow, memory_checker plugin is fetching the running containers without checking if Docker service is still running. I added this check.

How I did it
Use systemctl is-active to check if Docker engine is still running.

How to verify it
Use systemctl to stop docker engine and reload Monit, no errors in log and relevant print appears in log.

Which release branch to backport (provide reason below if selected)
The fix is required in 202205 and 202012 since the PR that introduced the issue was cherry picked to those branches (#11129).
2022-07-28 20:37:22 +00:00

161 lines
6.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
memory_checker
This script is part of the feature which will restart the container if memory
usage of it is larger than the threshold value.
This script is used to check the memory usage of specified cotnainer and
is intended to be run by Monit. It will write an alerting message into
syslog if memory usage of the container is larger than the threshold value for X
times within Y cycles/minutes. Note that if print(...) statement in this script
was executed, the string in it will be appended to Monit syslog messages.
The following is an example in Monit configuration file to show how Monit will run
this script:
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>"
"""
import argparse
import subprocess
import sys
import syslog
import re
import docker
def get_command_result(command):
"""Executes the command and return the resulting output.
Args:
command: A string contains the command to be executed.
Returns:
A string which contains the output of command.
"""
command_stdout = ""
try:
proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
shell=True, universal_newlines=True)
command_stdout, command_stderr = proc_instance.communicate()
if proc_instance.returncode != 0:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Return code: '{}'"
.format(command, proc_instance.returncode))
sys.exit(1)
except (OSError, ValueError) as err:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
.format(command, err))
sys.exit(2)
return command_stdout.strip()
def check_memory_usage(container_name, threshold_value):
"""Checks the memory usage of a container and writes an alerting messages into
the syslog if the memory usage is larger than the threshold value.
Args:
container_name: A string represtents name of a container
threshold_value: An integer indicates the threshold value (Bytes) of memory usage.
Returns:
None.
"""
command = "docker stats --no-stream --format \{{\{{.MemUsage\}}\}} {}".format(container_name)
command_stdout = get_command_result(command)
mem_usage = command_stdout.split("/")[0].strip()
match_obj = re.match(r"\d+\.?\d*", mem_usage)
if match_obj:
mem_usage_value = float(mem_usage[match_obj.start():match_obj.end()])
mem_usage_unit = mem_usage[match_obj.end():]
mem_usage_bytes = 0.0
if mem_usage_unit == "B":
mem_usage_bytes = mem_usage_value
elif mem_usage_unit == "KiB":
mem_usage_bytes = mem_usage_value * 1024
elif mem_usage_unit == "MiB":
mem_usage_bytes = mem_usage_value * 1024 ** 2
elif mem_usage_unit == "GiB":
mem_usage_bytes = mem_usage_value * 1024 ** 3
if mem_usage_bytes > threshold_value:
print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
.format(container_name, mem_usage_bytes, threshold_value))
syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
.format(container_name, mem_usage_bytes, threshold_value))
sys.exit(3)
else:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
.format(mem_usage))
sys.exit(4)
def is_service_active(service_name):
"""Test if service is running.
Args:
service_name: A string contains the service name
Returns:
True if service is running, False otherwise
"""
status = subprocess.run("systemctl is-active --quiet {}".format(service_name), shell=True, check=False)
return status.returncode == 0
def get_running_container_names():
"""Retrieves names of running containers by talking to the docker daemon.
Args:
None.
Returns:
running_container_names: A list indicates names of running containers.
"""
try:
docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
running_container_list = docker_client.containers.list(filters={"status": "running"})
running_container_names = [ container.name for container in running_container_list ]
except (docker.errors.APIError, docker.errors.DockerException) as err:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the running container list from docker daemon! Error message is: '{}'"
.format(err))
sys.exit(5)
return running_container_names
def main():
parser = argparse.ArgumentParser(description="Check memory usage of a container \
and an alerting message will be written into syslog if memory usage \
is larger than the threshold value", usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")
parser.add_argument("container_name", help="container name")
# TODO: Currently the threshold value is hard coded as a command line argument and will
# remove this in the new version since we want to read this value from 'CONFIG_DB'.
parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
args = parser.parse_args()
if not is_service_active("docker"):
syslog.syslog(syslog.LOG_INFO,
"[memory_checker] Exits without checking memory usage of container '{}' since docker daemon is not running!"
.format(args.container_name))
sys.exit(0)
running_container_names = get_running_container_names()
if args.container_name in running_container_names:
check_memory_usage(args.container_name, args.threshold_value)
else:
syslog.syslog(syslog.LOG_INFO,
"[memory_checker] Exits without checking memory usage since container '{}' is not running!"
.format(args.container_name))
if __name__ == "__main__":
main()