sonic-buildimage/files/image_config/monit/container_checker
Marty Y. Lok f94ce408c6 [chassis][supervisor]monit container-checker failed due to unexpected "database-chassis" docker running #9042 (#9043)
Why I did it
Fixed the monit container_checker fails due to unexpected "database-chassis" docker running on Supervisor card in the VOQ chassis. fixes #9042

How I did it
Added database-chassis to the always running docker list if platform is supervisor card.

How to verify it
Execute the CLI command "sudo monit status container_checker"


Signed-off-by: mlok <marty.lok@nokia.com>
2022-03-04 21:49:52 +00:00

165 lines
6.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
container_checker
This script is intended to be run by Monit. It will write an alerting message into
syslog if it found containers which were expected to run but were not running. At
the same time, if some containers were unexpected to run, it also writes an alerting
syslog message. Note that if print(...) statement in this script was executed, the
string in it will be appended to Monit syslog messages.
The following is an example in Monit configuration file to show how Monit will run
this script:
check program container_checker with path "/usr/bin/container_checker"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
"""
import docker
import sys
import swsssdk
from sonic_py_common import multi_asic, device_info
from swsscommon import swsscommon
def get_expected_running_containers():
"""
@summary: This function will get the expected running & always-enabled containers by following the rule:
The 'state' field of container in 'FEATURE' table should not be 'disabled' or 'always_disabled'.
If the device has Multi-ASIC, this function will get container list by determining the
value of field 'has_global_scope', the number of ASICs and the value of field
'has_per_asic_scope'.
If the device has single ASIC, the container name was put into the list.
@return: A set which contains the expected running containers and a set that has
containers marked as "always_enabled".
"""
config_db = swsssdk.ConfigDBConnector()
config_db.connect()
feature_table = config_db.get_table("FEATURE")
expected_running_containers = set()
always_running_containers = set()
for container_name in feature_table.keys():
if feature_table[container_name]["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
if feature_table[container_name]["has_global_scope"] == "True":
expected_running_containers.add(container_name)
if feature_table[container_name]["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
expected_running_containers.add(container_name + str(asic_id))
else:
expected_running_containers.add(container_name)
if feature_table[container_name]["state"] == 'always_enabled':
if multi_asic.is_multi_asic():
if feature_table[container_name]["has_global_scope"] == "True":
always_running_containers.add(container_name)
if feature_table[container_name]["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
always_running_containers.add(container_name + str(asic_id))
else:
always_running_containers.add(container_name)
if device_info.is_supervisor():
always_running_containers.add("database-chassis")
return expected_running_containers, always_running_containers
def get_current_running_from_DB(always_running_containers):
"""
@summary: This function will get the current running container list
from FEATURE table @ STATE_DB, if this table is available.
@return: a tuple
First: Return value indicating if info can be obtained from
DB or not.
Second: A set which contains the current running containers,
if this info is available in DB.
"""
running_containers = set()
state_db = swsscommon.DBConnector("STATE_DB", 0)
tbl = swsscommon.Table(state_db, "FEATURE")
if not tbl.getKeys():
return False, None
for name in tbl.getKeys():
data = dict(tbl.get(name)[1])
if data.get('container_id'):
running_containers.add(name)
DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock')
RUNNING = 'running'
for name in always_running_containers:
try:
container = DOCKER_CLIENT.containers.get(name)
container_state = container.attrs.get('State', {})
if container_state.get('Status', "") == RUNNING:
running_containers.add(name)
except (docker.errors.NotFound, docker.errors.APIError) as err:
print("Failed to get container '{}'. Error: '{}'".format(name, err))
pass
return True, running_containers
def get_current_running_from_dockers():
"""
@summary: This function will get all running containers from
the list of docker containers in running state.
@return: A set which contains containers that are
in running state.
"""
DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock')
running_containers = set()
ctrs = DOCKER_CLIENT.containers
try:
lst = ctrs.list(filters={"status": "running"})
for ctr in lst:
running_containers.add(ctr.name)
except docker.errors.APIError as err:
print("Failed to retrieve the running container list. Error: '{}'".format(err))
pass
return running_containers
def get_current_running_containers(always_running_containers):
"""
@summary: This function will get the list of currently running containers.
If available in STATE-DB, get from DB else from list of dockers.
@return: A set of currently running containers.
"""
ret, current_running_containers = get_current_running_from_DB(always_running_containers)
if not ret:
current_running_containers = get_current_running_from_dockers()
return current_running_containers
def main():
"""
@summary: This function will compare the difference between the current running containers
and the containers which were expected to run. If containers which were exepcted
to run were not running, then an alerting message will be written into syslog.
"""
expected_running_containers, always_running_containers = get_expected_running_containers()
current_running_containers = get_current_running_containers(always_running_containers)
expected_running_containers |= always_running_containers
not_running_containers = expected_running_containers.difference(current_running_containers)
if not_running_containers:
print("Expected containers not running: " + ", ".join(not_running_containers))
sys.exit(3)
unexpected_running_containers = current_running_containers.difference(expected_running_containers)
if unexpected_running_containers:
print("Unexpected running containers: " + ", ".join(unexpected_running_containers))
sys.exit(4)
if __name__ == "__main__":
main()
sys.exit(0)