bfec282a82
**- Why I did it** This PR aims to monitor the running status of each container. Currently the auto-restart feature is enabled. If a critical process exits unexpectedly, the container will be restarted. If the container is restarted 3 times within 20 minutes, it will not run anymore unless we manually clear the flag using the command `sudo systemctl reset-failed <container_name>`. **- How I did it** We employ Monit to monitor a script. This script generates the list of containers expected to run and compares it with the containers currently running. If there are containers which are expected to run but are not running, an alerting message is written into syslog. **- How to verify it** I tested this feature on the lab device `str-a7050-acs-3`, which has a single ASIC, and on `str2-n3164-acs-3`, which is a Multi-ASIC device. First I manually stopped a container by running the command `sudo systemctl stop <container_name>`, then I checked whether there was an alerting message in the syslog. Signed-off-by: Yong Zhao <yozhao@microsoft.com>
117 lines
4.5 KiB
Python
Executable File
117 lines
4.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
container_checker

This script is intended to be run by Monit. It writes an alerting message into
syslog if it finds containers which are expected to run but are not running. It
also writes an alerting syslog message if containers which are not expected to
run are running. Note that any string passed to a print(...) statement executed
in this script will be appended to the Monit syslog messages.

The following example from a Monit configuration file shows how Monit runs
this script:

check program container_checker with path "/usr/bin/container_checker"
    if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
|
|
import swsssdk
|
|
from sonic_py_common import multi_asic
|
|
|
|
|
|
def get_command_result(command):
    """
    @summary: Execute a shell command and return its standard output as lines.

    The command string is handed to the shell (shell=True), so it must come
    from a trusted source. Standard error is captured and discarded.

    @param command: The shell command line to execute.
    @return: A list of strings, one per line of the command's standard output,
             with trailing whitespace stripped from the output as a whole.
             A command that prints nothing yields [""].
    @note: Terminates the script with exit code 1 if the command returns a
           non-zero status, or with exit code 2 if the command could not be
           started (OSError/ValueError).
    """
    # Keep the try body limited to the call that can actually raise.
    try:
        completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                   shell=True, universal_newlines=True)
    except (OSError, ValueError) as err:
        print("Failed to execute the command '{}'. Error: '{}'".format(command, err))
        sys.exit(2)

    if completed.returncode != 0:
        print("Failed to execute the command '{}'. Return code: '{}'".format(
            command, completed.returncode))
        sys.exit(1)

    return completed.stdout.rstrip().split("\n")
|
|
|
|
|
|
def get_expected_running_containers():
    """
    @summary: Build the set of containers that are expected to be running.

    Every entry of the 'FEATURE' table in Config DB whose 'state' field is not
    'disabled' is considered:
      - On a single-ASIC device the container name itself is expected.
      - On a Multi-ASIC device the container name is expected if the field
        'has_global_scope' is "True", and one instance per ASIC, named
        <container_name><asic_id>, is expected if the field
        'has_per_asic_scope' is "True".

    @return: A set which contains the names of the expected running containers.
    """
    config_db = swsssdk.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    # The platform layout does not change while iterating over the features,
    # so query it once instead of once per table entry.
    is_multi_asic = multi_asic.is_multi_asic()
    num_asics = multi_asic.get_num_asics() if is_multi_asic else 0

    expected_running_containers = set()

    for container_name, feature in feature_table.items():
        if feature["state"] == "disabled":
            continue

        if not is_multi_asic:
            expected_running_containers.add(container_name)
            continue

        if feature["has_global_scope"] == "True":
            expected_running_containers.add(container_name)

        if feature["has_per_asic_scope"] == "True":
            expected_running_containers.update(
                container_name + str(asic_id) for asic_id in range(num_asics))

    return expected_running_containers
|
|
|
|
|
|
def get_current_running_containers():
    """
    @summary: Get the names of the containers that are currently running by
              analyzing the output of the command `docker ps`.

    @return: A set which contains the names of the current running containers.
    """
    # Ask Docker for the names only. This avoids depending on the column layout
    # of the default table output (header line, position of the NAMES column).
    command = "docker ps --format '{{.Names}}'"

    running_containers = set()
    for line in get_command_result(command):
        container_name = line.strip()
        # get_command_result() yields [""] when no container is running.
        if container_name:
            running_containers.add(container_name)

    return running_containers
|
|
|
|
|
|
def main():
    """
    @summary: Compare the current running containers with the containers which
              are expected to run and report any difference.

    A message is printed for containers which are expected to run but are not
    running, and another one for containers which are running but were not
    expected to. Container names are listed in sorted order so that the message
    text is stable between runs.

    @note: Exits with code 3 if expected containers are not running, otherwise
           with code 4 if unexpected containers are running. Returns normally
           when both sets match.
    """
    expected_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers()

    not_running_containers = expected_running_containers - current_running_containers
    unexpected_running_containers = current_running_containers - expected_running_containers

    # Report both kinds of mismatch before exiting so neither one is hidden
    # by the other.
    if not_running_containers:
        print("Expected containers not running: " + ", ".join(sorted(not_running_containers)))

    if unexpected_running_containers:
        print("Unexpected running containers: " + ", ".join(sorted(unexpected_running_containers)))

    # Missing containers take priority for the exit code.
    if not_running_containers:
        sys.exit(3)

    if unexpected_running_containers:
        sys.exit(4)
|
|
|
|
|
|
# Run the check only when this file is executed as a script (e.g. by Monit),
# not when it is imported.
if __name__ == "__main__":
    main()
|