[Monit] Restart telemetry container if memory usage is beyond the threshold (#7645)

Signed-off-by: Yong Zhao yozhao@microsoft.com

Why I did it
This PR monitors the memory usage of the streaming telemetry container and restarts that container if its memory usage is larger than the pre-defined threshold.

How I did it
I used the system tool Monit to run a script, memory_checker, which periodically checks the memory usage of the streaming telemetry container. If the memory usage of the telemetry container is larger than the pre-defined threshold for 10 times within 20 cycles, an alerting message is written into syslog and, at the same time, Monit runs the script restart_service to restart the streaming telemetry container.

How to verify it
I verified this implementation on device str-7260cx3-acs-1.
This commit is contained in:
yozhao101 2021-05-28 11:13:44 -07:00 committed by GitHub
parent 0c5c4872dc
commit 37863ac854
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 221 additions and 0 deletions

View File

@ -9,3 +9,6 @@ check program telemetry|telemetry with path "/usr/bin/process_checker telemetry
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
check program container_memory_telemetry with path "/usr/bin/memory_checker telemetry 419430400"
if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service telemetry"

View File

@ -334,6 +334,10 @@ sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker
sudo cp $IMAGE_CONFIGS/monit/memory_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/memory_checker
sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service
# Install custom-built openssh sshd

View File

@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
memory_checker
This script is part of the feature which will restart the container if memory
usage of it is larger than the threshold value.
This script is used to check the memory usage of the specified container and
is intended to be run by Monit. It will write an alerting message into
syslog if memory usage of the container is larger than the threshold value for X
times within Y cycles/minutes. Note that if print(...) statement in this script
was executed, the string in it will be appended to Monit syslog messages.
The following is an example in Monit configuration file to show how Monit will run
this script:
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
if status == 3 for X times within Y cycles then exec "/usr/bin/restart_service <container_name>"
"""
import argparse
import subprocess
import sys
import syslog
import re
def get_command_result(command):
    """Execute a command and return its standard output.

    Args:
        command: The command to be executed. A string is run through the
            shell; a list of arguments is executed directly, without a shell.

    Returns:
        A string which contains the stdout of the command with leading and
        trailing whitespace removed.

    Exits (instead of returning):
        With status 1 if the command returned a non-zero exit code.
        With status 2 if the command could not be started
        (OSError or ValueError raised by subprocess).
    """
    try:
        # Only string commands need the shell; an argument list is passed to
        # the program as-is so no shell quoting is involved.
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=isinstance(command, str), universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    if proc_instance.returncode != 0:
        # Include stderr so the reason for the failure ends up in syslog.
        syslog.syslog(syslog.LOG_ERR,
                      "[memory_checker] Failed to execute the command '{}'. Return code: '{}'. Stderr: '{}'"
                      .format(command, proc_instance.returncode, command_stderr.strip()))
        sys.exit(1)

    return command_stdout.strip()
def check_memory_usage(container_name, threshold_value):
    """Check the memory usage of a container against a threshold.

    Runs `docker stats` once for the container, converts the reported memory
    usage into bytes and, if it is larger than the threshold, prints an
    alerting message and exits with status 3. The printed string is appended
    by Monit to its own syslog message.

    Args:
        container_name: A string representing the name of a container.
        threshold_value: An integer indicating the threshold value (Bytes) of memory usage.

    Returns:
        None when the memory usage is not larger than the threshold.

    Exits (instead of returning):
        With status 3 if the memory usage is larger than the threshold.
        With status 4 if no memory value could be retrieved from the command
        output (no leading number, or a unit that is not recognized).
    """
    # Multipliers for the unit suffixes this script understands.
    unit_multipliers = {
        "B": 1,
        "KiB": 1024,
        "MiB": 1024 ** 2,
        "GiB": 1024 ** 3,
        "TiB": 1024 ** 4,
    }

    # The doubled braces come out of str.format() as the literal template
    # '{{.MemUsage}}'; the single quotes stop the shell from touching it.
    command = "docker stats --no-stream --format '{{{{.MemUsage}}}}' {}".format(container_name)
    command_stdout = get_command_result(command)

    # Only the part before the "/" (the usage, not the limit) is of interest.
    mem_usage = command_stdout.split("/")[0].strip()
    match_obj = re.match(r"\d+\.?\d*", mem_usage)

    multiplier = None
    if match_obj:
        mem_usage_unit = mem_usage[match_obj.end():].strip()
        multiplier = unit_multipliers.get(mem_usage_unit)

    if multiplier is None:
        # Treat an unknown unit like an unparsable value: silently assuming
        # 0 bytes would hide a container that is over the threshold.
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    mem_usage_bytes = float(match_obj.group()) * multiplier
    if mem_usage_bytes > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        sys.exit(3)
def main():
    """Parse the command line arguments and run the memory usage check."""
    description = ("Check memory usage of a container "
                   "and an alerting message will be written into syslog if memory usage "
                   "is larger than the threshold value")
    arg_parser = argparse.ArgumentParser(
        description=description,
        usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")

    arg_parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    arg_parser.add_argument("threshold_value", type=int, help="threshold value in bytes")

    options = arg_parser.parse_args()
    check_memory_usage(options.container_name, options.threshold_value)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
restart_service
This script is part of the feature which will restart the container if memory
usage of it is larger than the threshold value.
This script is intended to be run by Monit and is used to restart the specified
container if the memory usage of it is larger than the threshold value for X
times within Y cycles/minutes.
The following is an example in Monit configuration file to show how Monit will run
this script:
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
if status == 3 for X times within Y cycles then exec "/usr/bin/restart_service <container_name>"
"""
import argparse
import sys
import syslog
import subprocess
def get_command_result(command):
    """Execute a command through the shell and report how it went.

    Args:
        command: A string containing the command to be executed.

    Returns:
        A tuple (status, stdout, stderr):
        status: An integer. 0 if the command returned exit code 0, 1 if it
            returned any non-zero exit code, 2 if it could not be started
            (OSError or ValueError raised by subprocess).
        stdout: A string containing the stripped stdout of the command.
        stderr: A string containing the stripped stderr of the command, or
            the text of the exception when status is 2.
    """
    command_stdout = ""
    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        # Hand back the message as text so that all three return paths yield
        # a string in the stderr position.
        return 2, command_stdout.strip(), str(err)

    status = 0 if proc_instance.returncode == 0 else 1
    return status, command_stdout.strip(), command_stderr.strip()
def reset_failed_flag(service_name):
    """Reset the failed status of a service and log the outcome to syslog.

    Runs `sudo systemctl reset-failed <service_name>.service`.

    Args:
        service_name: Name of the service (without the '.service' suffix).

    Returns:
        None
    """
    syslog.syslog(syslog.LOG_INFO,
                  "Resetting failed status of service '{}' ...".format(service_name))

    status, _, error_output = get_command_result(
        "sudo systemctl reset-failed {}.service".format(service_name))

    if status != 0:
        syslog.syslog(syslog.LOG_ERR,
                      "Failed to reset failed status of service '{}'. Error: {}"
                      .format(service_name, error_output))
        return

    syslog.syslog(syslog.LOG_INFO,
                  "Succeeded to reset failed status of service '{}.service'."
                  .format(service_name))
def restart_service(service_name):
    """Reset the failed status of a service, then restart it.

    Runs `sudo systemctl restart <service_name>.service`; a failure to
    restart is written to syslog.

    Args:
        service_name: Name of the specified service (without the '.service' suffix).

    Returns:
        None.
    """
    reset_failed_flag(service_name)

    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
    status, _, error_output = get_command_result(
        "sudo systemctl restart {}.service".format(service_name))

    if status == 0:
        return

    syslog.syslog(syslog.LOG_ERR,
                  "Failed to restart the service '{}'. Error: {}"
                  .format(service_name, error_output))
def main():
    """Read the service name from the command line and restart that service."""
    arg_parser = argparse.ArgumentParser(
        usage="/usr/bin/restart_service <service_name>",
        description="Restart a specific service")
    arg_parser.add_argument("service_name", help="service name")

    options = arg_parser.parse_args()
    restart_service(options.service_name)


if __name__ == "__main__":
    main()