[Monit] Restart telemetry container if memory usage is beyond the threshold (#7645)
Signed-off-by: Yong Zhao yozhao@microsoft.com. Why I did it: This PR monitors the memory usage of the streaming telemetry container and restarts the container if its memory usage is larger than a pre-defined threshold. How I did it: I used the system tool Monit to run a script, memory_checker, which periodically checks the memory usage of the streaming telemetry container. If the memory usage of the telemetry container is larger than the pre-defined threshold 10 times within 20 cycles, an alerting message is written into syslog and, at the same time, Monit runs the script restart_service to restart the streaming telemetry container. How to verify it: I verified this implementation on device str-7260cx3-acs-1.
This commit is contained in:
parent
0c5c4872dc
commit
37863ac854
@ -9,3 +9,6 @@ check program telemetry|telemetry with path "/usr/bin/process_checker telemetry
|
||||
|
||||
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
|
||||
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
||||
|
||||
check program container_memory_telemetry with path "/usr/bin/memory_checker telemetry 419430400"
|
||||
if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service telemetry"
|
||||
|
@ -334,6 +334,10 @@ sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
|
||||
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
|
||||
sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/
|
||||
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker
|
||||
sudo cp $IMAGE_CONFIGS/monit/memory_checker $FILESYSTEM_ROOT/usr/bin/
|
||||
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/memory_checker
|
||||
sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
|
||||
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service
|
||||
|
||||
|
||||
# Install custom-built openssh sshd
|
||||
|
109
files/image_config/monit/memory_checker
Executable file
109
files/image_config/monit/memory_checker
Executable file
@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
memory_checker
|
||||
|
||||
This script is part of the feature which will restart the container if memory
|
||||
usage of it is larger than the threshold value.
|
||||
|
||||
This script is used to check the memory usage of the specified container and
|
||||
is intended to be run by Monit. It will write an alerting message into
|
||||
syslog if memory usage of the container is larger than the threshold value for X
|
||||
times within Y cycles/minutes. Note that if print(...) statement in this script
|
||||
was executed, the string in it will be appended to Monit syslog messages.
|
||||
|
||||
The following is an example in Monit configuration file to show how Monit will run
|
||||
this script:
|
||||
|
||||
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
|
||||
if status == 3 for X times within Y cycles then exec "/usr/bin/restart_service <container_name>"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import syslog
|
||||
import re
|
||||
|
||||
|
||||
def get_command_result(command):
    """Run a shell command and return its standard output.

    Any failure is written to syslog before the script exits, because the
    caller only ever receives the output of a successful command.

    Args:
        command: A string containing the command line to be executed
            through the shell.

    Returns:
        A string with the standard output of the command, stripped of
        leading and trailing whitespace.

    Exits:
        1 if the command finished with a non-zero return code.
        2 if the command could not be launched (OSError or ValueError).
    """
    # Keep the try body limited to the calls that can raise the handled errors.
    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    if proc_instance.returncode != 0:
        # Include stderr so the log entry explains why the command failed,
        # not merely that it did.
        syslog.syslog(syslog.LOG_ERR,
                      "[memory_checker] Failed to execute the command '{}'. Return code: '{}'. Error: '{}'"
                      .format(command, proc_instance.returncode, command_stderr.strip()))
        sys.exit(1)

    return command_stdout.strip()
|
||||
|
||||
|
||||
def check_memory_usage(container_name, threshold_value):
    """Check the memory usage of a container against a threshold.

    The usage is read from `docker stats` and converted to bytes. If it is
    larger than the threshold, an alerting message is printed and the script
    exits with status 3.

    Args:
        container_name: A string representing the name of a container.
        threshold_value: An integer indicating the threshold value (Bytes)
            of memory usage.

    Returns:
        None.

    Exits:
        3 if the memory usage is larger than `threshold_value`.
        4 if the memory usage reported by docker cannot be interpreted
          (no leading number, or an unrecognised unit).
    """
    bytes_per_unit = {
        "B": 1,
        "KiB": 1024,
        "MiB": 1024 ** 2,
        "GiB": 1024 ** 3,
        "TiB": 1024 ** 4,
    }

    # '{{{{' / '}}}}' are str.format escapes for literal '{{' / '}}'. The Go
    # template is single-quoted so the shell passes it to docker untouched.
    command = "docker stats --no-stream --format '{{{{.MemUsage}}}}' {}".format(container_name)
    command_stdout = get_command_result(command)
    # Only the text before the first '/' is used as the current usage.
    mem_usage = command_stdout.split("/")[0].strip()

    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    if not match_obj:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    mem_usage_value = float(match_obj.group())
    mem_usage_unit = mem_usage[match_obj.end():].strip()

    # An unknown unit must not be silently treated as 0 bytes, otherwise the
    # threshold check could never fire for that output.
    if mem_usage_unit not in bytes_per_unit:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Unknown memory unit '{}' in '{}'"
                      .format(mem_usage_unit, mem_usage))
        sys.exit(4)

    mem_usage_bytes = mem_usage_value * bytes_per_unit[mem_usage_unit]
    if mem_usage_bytes > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        sys.exit(3)
|
||||
|
||||
|
||||
def main():
    """Parse the command line arguments and check the container's memory usage.

    Expects two positional arguments: the container name and the threshold
    value in bytes.
    """
    # Implicit string concatenation keeps source indentation out of the
    # description, unlike a backslash continuation inside the literal.
    parser = argparse.ArgumentParser(
        description="Check memory usage of a container "
                    "and an alerting message will be written into syslog if memory usage "
                    "is larger than the threshold value",
        usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")
    parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
    args = parser.parse_args()

    check_memory_usage(args.container_name, args.threshold_value)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
105
files/image_config/monit/restart_service
Executable file
105
files/image_config/monit/restart_service
Executable file
@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
restart_service
|
||||
|
||||
This script is part of the feature which will restart the container if memory
|
||||
usage of it is larger than the threshold value.
|
||||
|
||||
This script is intended to be run by Monit and is used to restart the specified
|
||||
container if the memory usage of it is larger than the threshold value for X
|
||||
times within Y cycles/minutes.
|
||||
|
||||
The following is an example in Monit configuration file to show how Monit will run
|
||||
this script:
|
||||
|
||||
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
|
||||
if status == 3 for X times within Y cycles then exec "/usr/bin/restart_service <container_name>"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import syslog
|
||||
import subprocess
|
||||
|
||||
|
||||
def get_command_result(command):
    """Run a shell command and return its status, stdout and stderr.

    Args:
        command: A string containing the command line to be executed
            through the shell.

    Returns:
        A tuple `(exit_code, stdout, stderr)` where:
          exit_code: 0 if the command returned 0, 1 if it returned any
              non-zero code, 2 if it could not be launched.
          stdout: A string with the stripped standard output.
          stderr: A string with the stripped standard error, or the text
              of the exception when the command could not be launched.
    """
    command_stdout = ""
    command_stderr = ""

    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        # Return the message as a string so every path yields the same types.
        return 2, command_stdout.strip(), str(err)

    exit_code = 0 if proc_instance.returncode == 0 else 1
    return exit_code, command_stdout.strip(), command_stderr.strip()
|
||||
|
||||
|
||||
def reset_failed_flag(service_name):
    """Clear the 'failed' state systemd holds for a service.

    Runs `systemctl reset-failed` on `<service_name>.service` and records
    the outcome in syslog.

    Args:
        service_name: Name of the service (without the '.service' suffix).

    Returns:
        None
    """
    command = "sudo systemctl reset-failed {}.service".format(service_name)

    syslog.syslog(syslog.LOG_INFO,
                  "Resetting failed status of service '{}' ...".format(service_name))

    status, _, error_output = get_command_result(command)
    if status != 0:
        syslog.syslog(syslog.LOG_ERR,
                      "Failed to reset failed status of service '{}'. Error: {}".format(service_name, error_output))
        return

    syslog.syslog(syslog.LOG_INFO,
                  "Succeeded to reset failed status of service '{}.service'.".format(service_name))
|
||||
|
||||
|
||||
def restart_service(service_name):
    """Restart a systemd service after clearing its failed state.

    The failed flag is reset first, then `systemctl restart` is issued for
    `<service_name>.service`. A failed restart is reported to syslog.

    Args:
        service_name: Name of the service (without the '.service' suffix).

    Returns:
        None.
    """
    command = "sudo systemctl restart {}.service".format(service_name)

    reset_failed_flag(service_name)

    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
    status, _, error_output = get_command_result(command)
    if status == 0:
        return

    syslog.syslog(syslog.LOG_ERR,
                  "Failed to restart the service '{}'. Error: {}".format(service_name, error_output))
|
||||
|
||||
|
||||
def main():
    """Read the service name from the command line and restart that service."""
    arg_parser = argparse.ArgumentParser(
        description="Restart a specific service",
        usage="/usr/bin/restart_service <service_name>",
    )
    arg_parser.add_argument("service_name", help="service name")
    parsed = arg_parser.parse_args()

    restart_service(parsed.service_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Reference in New Issue
Block a user