2737c9681f
Signed-off-by: Yong Zhao yozhao@microsoft.com Why I did it Since we introduced a new value always_disabled for the state field in the FEATURE table, the expected running container list should exclude the always_disabled containers. This bug was found by the nightly test and reported in an issue. This PR fixes #7210. How I did it I added a condition that checks whether the state field of a container is always_disabled. How to verify it I verified this on the device str-dx010-acs-1. Which release branch to backport (provide reason below if selected) 201811 201911 202006 [x] 202012
117 lines
4.5 KiB
Python
Executable File
117 lines
4.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
container_checker

This script is intended to be run by Monit. It writes an alerting message into
syslog if it finds containers which are expected to run but are not running.
Likewise, if some containers are running unexpectedly, it also writes an alerting
syslog message. Note that whenever a print(...) statement in this script is
executed, the printed string is appended to the Monit syslog message.

The following is an example in a Monit configuration file to show how Monit will run
this script:

check program container_checker with path "/usr/bin/container_checker"
    if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
|
|
import swsssdk
|
|
from sonic_py_common import multi_asic
|
|
|
|
|
|
def get_command_result(command):
    """
    @summary: Run `command` through the shell and collect what it writes to stdout.
              If the command exits with a non-zero status, a message is printed and
              the script exits with status 1. If the command cannot be launched
              (OSError or ValueError), a message is printed and the script exits
              with status 2.
    @param command: The shell command line to execute, as a single string.
    @return: A list of strings, one per line of the command's stdout, with trailing
             whitespace stripped from the end of the output. A command that prints
             nothing yields a list holding one empty string.
    """
    try:
        process = subprocess.Popen(
            command,
            shell=True,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # stderr is captured so it does not leak into the caller's output,
        # but its content is not used.
        output, _ = process.communicate()
    except (OSError, ValueError) as err:
        print("Failed to execute the command '{}'. Error: '{}'".format(command, err))
        sys.exit(2)
    else:
        if process.returncode != 0:
            print("Failed to execute the command '{}'. Return code: '{}'".format(
                command, process.returncode))
            sys.exit(1)

    return output.rstrip().split("\n")
|
|
|
|
|
|
def get_expected_running_containers():
    """
    @summary: Build the set of containers that are expected to be running, based on
              the 'FEATURE' table in Config DB.
              A feature is skipped when its 'state' field is 'disabled' or
              'always_disabled'.
              On a single-ASIC device every remaining feature contributes its own
              name. On a Multi-ASIC device a feature contributes its own name when
              'has_global_scope' is "True", and one name per ASIC (the feature name
              followed by the ASIC index) when 'has_per_asic_scope' is "True".
    @return: A set which contains the names of the expected running containers.
    """
    config_db = swsssdk.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    containers = set()

    for name, attributes in feature_table.items():
        if attributes["state"] in ["disabled", "always_disabled"]:
            continue

        if not multi_asic.is_multi_asic():
            containers.add(name)
            continue

        if attributes["has_global_scope"] == "True":
            containers.add(name)

        if attributes["has_per_asic_scope"] == "True":
            asic_count = multi_asic.get_num_asics()
            containers.update(name + str(asic_id) for asic_id in range(asic_count))

    return containers
|
|
|
|
|
|
def get_current_running_containers():
    """
    @summary: Collect the names of the containers that are currently running by
              parsing the output of the command `docker ps`. The first output line
              (the column header) is skipped, and the last whitespace-separated
              field of every following line is taken as the container name.
    @return: A set which contains the current running containers.
    """
    docker_ps_lines = get_command_result("docker ps")

    return {entry.split()[-1].strip() for entry in docker_ps_lines[1:]}
|
|
|
|
|
|
def main():
    """
    @summary: Compare the containers which are currently running against the
              containers which are expected to run, and report any mismatch.
              The messages are printed to stdout; per the module description, Monit
              appends printed strings to its syslog message.
              - If expected containers are not running, their names are printed and
                the script exits with status 3.
              - Otherwise, if unexpected containers are running, their names are
                printed and the script exits with status 4.
              - Otherwise the function returns normally.
              Container names are printed in sorted order so that the same
              condition always produces the same message text.
    """
    expected_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers()

    not_running_containers = expected_running_containers.difference(current_running_containers)
    if not_running_containers:
        # Sets iterate in arbitrary order; sort to keep the alert text stable.
        print("Expected containers not running: " + ", ".join(sorted(not_running_containers)))
        sys.exit(3)

    unexpected_running_containers = current_running_containers.difference(expected_running_containers)
    if unexpected_running_containers:
        # Same reasoning as above: deterministic ordering of the reported names.
        print("Unexpected running containers: " + ", ".join(sorted(unexpected_running_containers)))
        sys.exit(4)
|
|
|
|
|
|
# Run the check only when this file is executed as a script (e.g. by Monit),
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|