[201911][Monit] Restart telemetry container if memory usage is beyond the threshold (#7618)

This PR monitors the memory usage of the streaming telemetry container and restarts the container if its memory usage exceeds the pre-defined threshold.
This commit is contained in:
yozhao101 2021-05-17 16:51:13 -07:00 committed by GitHub
parent 306d57e189
commit 24e1cde1e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 186 additions and 0 deletions

View File

@ -9,3 +9,6 @@ check program telemetry|telemetry with path "/usr/bin/process_checker telemetry
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
# Check the memory usage of the telemetry container against a threshold of
# 419430400 bytes (400 MiB). memory_checker exits with status 3 when the usage
# is above the threshold; restart telemetry if that happens 10 times within 20 cycles.
check program container_memory_telemetry with path "/usr/bin/memory_checker telemetry 419430400"
if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service telemetry"

View File

@ -209,6 +209,11 @@ sudo cp $IMAGE_CONFIGS/monit/generate_monit_config.service $FILESYSTEM_ROOT_USR_
echo "generate_monit_config.service" | sudo tee -a $GENERATED_SERVICE_FILE
sudo cp $IMAGE_CONFIGS/monit/generate_monit_config $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/generate_monit_config
# Install the memory_checker and restart_service scripts (invoked from the Monit
# configuration) into /usr/bin and make them executable
sudo cp $IMAGE_CONFIGS/monit/memory_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/memory_checker
sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service
# Copy crontabs
sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/

View File

@ -0,0 +1,90 @@
#!/usr/bin/env python
import argparse
import subprocess
import sys
import syslog
import re
def get_command_result(command):
    """Run a shell command and hand back what it wrote to stdout.

    Args:
        command: A string holding the command line; it is run through the shell.

    Returns:
        The command's stdout with leading and trailing whitespace removed.

    Exits:
        With status 1 when the command finishes with a non-zero return code,
        and with status 2 when launching it raises OSError or ValueError.
        Either failure is written to syslog before exiting.
    """
    try:
        child = subprocess.Popen(command, shell=True, universal_newlines=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # stderr is drained so the child cannot block on a full pipe, but it is not used.
        output, _ = child.communicate()
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    if child.returncode != 0:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Return code: '{}'"
                      .format(command, child.returncode))
        sys.exit(1)

    return output.strip()
def check_memory_usage(container_name, threshold_value):
    """Checks the memory usage of a container against a threshold.

    The usage is read from `docker stats` and converted to bytes. If it is
    larger than the threshold, an alerting message is printed to stdout and
    the process exits with status 3.

    Args:
        container_name: A string representing the name of a container.
        threshold_value: An integer indicating the threshold value (Bytes) of memory usage.

    Returns:
        None when the memory usage is within the threshold.

    Exits:
        With status 3 if the memory usage is larger than the threshold, and
        with status 4 (after logging to syslog) if the usage reported by
        docker cannot be parsed or carries a unit that is not recognized.
    """
    # Raw string: the backslashes are consumed by the shell, which then passes the
    # Go template '{{.MemUsage}}' to docker. Doubled braces are str.format() escapes.
    command = r"docker stats --no-stream --format \{{\{{.MemUsage\}}\}} {}".format(container_name)
    command_stdout = get_command_result(command)

    # Only the part before the '/' is evaluated (presumably "<usage> / <limit>").
    mem_usage = command_stdout.split("/")[0].strip()

    unit_multipliers = {
        "B": 1,
        "KiB": 1024,
        "MiB": 1024 ** 2,
        "GiB": 1024 ** 3,
        "TiB": 1024 ** 4,
    }

    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    mem_usage_unit = mem_usage[match_obj.end():].strip() if match_obj else None

    # An unknown unit must not be treated as 0 bytes: that would silently
    # disable the check. Report it the same way as an unparsable value.
    if match_obj is None or mem_usage_unit not in unit_multipliers:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    mem_usage_bytes = float(match_obj.group(0)) * unit_multipliers[mem_usage_unit]
    if mem_usage_bytes > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        sys.exit(3)
def main():
    """Parse the command line and run the memory check.

    Expects two positional arguments: the container name and the threshold
    value in bytes (parsed as an integer).
    """
    description = ("Check memory usage of a container "
                   "and an alerting message will be written into syslog if memory usage "
                   "is larger than the threshold value")
    arg_parser = argparse.ArgumentParser(
        description=description,
        usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")
    arg_parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    arg_parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
    options = arg_parser.parse_args()

    check_memory_usage(options.container_name, options.threshold_value)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
import argparse
import sys
import syslog
import subprocess
def get_command_result(command):
    """Executes command and return the exit code, stdout and stderr.

    Args:
        command: A string contains the command to be executed (run through the shell).

    Returns:
        A tuple of three elements:
        An integer status: 0 if the command returned 0, 1 if it returned a
        non-zero code, 2 if it could not be executed (OSError or ValueError).
        A string contains the stripped output of stdout (empty for status 2).
        A string contains the stripped output of stderr, or the text of the
        raised error for status 2.
    """
    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        # Return the error as text so that all three elements keep a stable
        # type for callers, which format the third element into log messages.
        return 2, "", str(err)

    exit_code = 0 if proc_instance.returncode == 0 else 1
    return exit_code, command_stdout.strip(), command_stderr.strip()
def reset_failed_flag(service_name):
    """Clear the systemd 'failed' state of a service.

    Runs `sudo systemctl reset-failed <service_name>.service` and records the
    attempt and its outcome in syslog.

    Args:
        service_name: Name of the service (without the '.service' suffix).

    Returns:
        None
    """
    syslog.syslog(syslog.LOG_INFO, "Resetting failed status of service '{}' ..."
                  .format(service_name))

    status, _, error_output = get_command_result(
        "sudo systemctl reset-failed {}.service".format(service_name))

    if status != 0:
        syslog.syslog(syslog.LOG_ERR, "Failed to reset failed status of service '{}'. Error: {}"
                      .format(service_name, error_output))
        return

    syslog.syslog(syslog.LOG_INFO, "Succeeded to reset failed status of service '{}.service'."
                  .format(service_name))
def restart_service(service_name):
    """Clear the failed state of a service, then restart it via systemctl.

    A failure to restart is written to syslog; nothing is raised or returned.

    Args:
        service_name: Name of the service (without the '.service' suffix).

    Returns:
        None.
    """
    # The failed flag is reset before the restart is attempted.
    reset_failed_flag(service_name)

    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
    status, _, error_output = get_command_result(
        "sudo systemctl restart {}.service".format(service_name))

    if status == 0:
        return

    syslog.syslog(syslog.LOG_ERR, "Failed to restart the service '{}'. Error: {}"
                  .format(service_name, error_output))
def main():
    """Parse the command line and restart the requested service.

    Expects a single positional argument: the service name.
    """
    arg_parser = argparse.ArgumentParser(
        usage="/usr/bin/restart_service <service_name>",
        description="Restart a specific service")
    arg_parser.add_argument("service_name", help="service name")
    options = arg_parser.parse_args()

    restart_service(options.service_name)


if __name__ == "__main__":
    main()