f94ce408c6
Why I did it Fixed the monit container_checker fails due to unexpected "database-chassis" docker running on Supervisor card in the VOQ chassis. fixes #9042 How I did it Added database-chassis to the always running docker list if platform is supervisor card. How to verify it Execute the CLI command "sudo monit status container_checker" Signed-off-by: mlok <marty.lok@nokia.com>
165 lines
6.8 KiB
Python
Executable File
165 lines
6.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
|
|
container_checker
|
|
|
|
This script is intended to be run by Monit. It writes an alerting message into
syslog if containers which are expected to run are not running. Likewise, if
containers which are not expected to run are running, it also writes an alerting
syslog message. Note that any string passed to a print(...) statement executed in
this script will be appended to the Monit syslog messages.
|
|
|
|
The following is an example in Monit configuration file to show how Monit will run
|
|
this script:
|
|
|
|
check program container_checker with path "/usr/bin/container_checker"
|
|
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
|
"""
|
|
|
|
import docker
|
|
import sys
|
|
|
|
import swsssdk
|
|
from sonic_py_common import multi_asic, device_info
|
|
from swsscommon import swsscommon
|
|
|
|
|
|
def get_expected_running_containers():
    """
    @summary: Build the sets of expected-running and always-enabled containers
              from the 'FEATURE' table in CONFIG_DB.
              A feature is expected to run unless its 'state' field is 'disabled'
              or 'always_disabled'. A feature is always running if its 'state'
              field is 'always_enabled'.
              On a Multi-ASIC device the container names are derived from the
              'has_global_scope' field (the bare feature name) and the
              'has_per_asic_scope' field (the feature name suffixed with each
              ASIC index). On a single-ASIC device the feature name is used as is.
              On a supervisor card "database-chassis" is added to the
              always-running set.
    @return: A tuple of two sets: the expected running containers and the
             containers marked as "always_enabled".
    """
    def container_names_for(feature, fields):
        # Expand one FEATURE entry into the container name(s) it maps to.
        if not multi_asic.is_multi_asic():
            return [feature]

        names = []
        if fields["has_global_scope"] == "True":
            names.append(feature)
        if fields["has_per_asic_scope"] == "True":
            asic_count = multi_asic.get_num_asics()
            names.extend(feature + str(index) for index in range(asic_count))
        return names

    db = swsssdk.ConfigDBConnector()
    db.connect()
    features = db.get_table("FEATURE")

    expected = set()
    always_on = set()

    for feature, fields in features.items():
        if fields["state"] not in ("disabled", "always_disabled"):
            expected.update(container_names_for(feature, fields))
        if fields["state"] == 'always_enabled':
            always_on.update(container_names_for(feature, fields))

    if device_info.is_supervisor():
        always_on.add("database-chassis")

    return expected, always_on
|
|
|
|
def get_current_running_from_DB(always_running_containers):
    """
    @summary: This function will get the current running container list
              from FEATURE table @ STATE_DB, if this table is available.
              A feature is counted as running when its entry has a non-empty
              'container_id' field. In addition, every container named in
              always_running_containers is looked up through the Docker daemon
              and counted as running when its status is 'running'.
    @param always_running_containers: Iterable of container names whose state
              is checked through the Docker daemon.
    @return: a tuple
             First: False if the FEATURE table in STATE_DB has no keys,
                    True otherwise.
             Second: None when First is False; otherwise a set which contains
                     the current running containers.
    """
    state_db = swsscommon.DBConnector("STATE_DB", 0)
    feature_tbl = swsscommon.Table(state_db, "FEATURE")

    # Read the key list once so that the emptiness check and the iteration
    # below operate on the same snapshot of the table.
    feature_names = feature_tbl.getKeys()
    if not feature_names:
        return False, None

    running_containers = set()
    for name in feature_names:
        # The second element returned by get() holds the entry's field/value
        # pairs; turn it into a dict for lookup by field name.
        data = dict(feature_tbl.get(name)[1])
        if data.get('container_id'):
            running_containers.add(name)

    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    for name in always_running_containers:
        try:
            container = docker_client.containers.get(name)
        except (docker.errors.NotFound, docker.errors.APIError) as err:
            # Best effort: report the failure (printed text ends up in the
            # Monit syslog message) and treat the container as not running.
            print("Failed to get container '{}'. Error: '{}'".format(name, err))
            continue

        container_state = container.attrs.get('State', {})
        if container_state.get('Status', "") == 'running':
            running_containers.add(name)

    return True, running_containers
|
|
|
|
|
|
def get_current_running_from_dockers():
    """
    @summary: Ask the Docker daemon for the containers whose status is
              "running" and collect their names. If the daemon reports an
              API error, a message is printed and whatever names were
              collected so far are returned.
    @return: A set which contains the names of the containers that are
             in running state.
    """
    client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    names = set()
    collection = client.containers
    try:
        names.update(entry.name for entry in collection.list(filters={"status": "running"}))
    except docker.errors.APIError as err:
        print("Failed to retrieve the running container list. Error: '{}'".format(err))
    return names
|
|
|
|
|
|
def get_current_running_containers(always_running_containers):
    """
    @summary: Return the currently running containers. The STATE_DB lookup is
              tried first; when it reports that no information is available,
              the list of running docker containers is used instead.
    @param always_running_containers: Passed through unchanged to the
              STATE_DB lookup.
    @return: A set of currently running containers.
    """
    available_in_db, running = get_current_running_from_DB(always_running_containers)
    if available_in_db:
        return running
    return get_current_running_from_dockers()
|
|
|
|
|
|
def main():
    """
    @summary: Compare the currently running containers with the containers
              which are expected to run (the expected set merged with the
              always-running set).
              If any expected container is not running, print an alerting
              message and exit with status 3.
              Otherwise, if any container is running that is not expected,
              print an alerting message and exit with status 4.
              Printed messages are appended to the Monit syslog message.
    @return: None when both checks pass; otherwise the process exits via
             sys.exit().
    """
    expected_running_containers, always_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers(always_running_containers)

    expected_running_containers |= always_running_containers

    # Names are sorted before joining: set iteration order is not stable
    # across runs, and a stable message is easier to read and compare in syslog.
    not_running_containers = expected_running_containers.difference(current_running_containers)
    if not_running_containers:
        print("Expected containers not running: " + ", ".join(sorted(not_running_containers)))
        sys.exit(3)

    unexpected_running_containers = current_running_containers.difference(expected_running_containers)
    if unexpected_running_containers:
        print("Unexpected running containers: " + ", ".join(sorted(unexpected_running_containers)))
        sys.exit(4)
|
|
|
|
|
|
if __name__ == "__main__":
    # main() terminates the process itself with status 3 or 4 when a check
    # fails; getting past it means every check passed, so report success.
    main()
    raise SystemExit(0)
|