09fe3f467f
Add events for dhcp-relay, bgp, syncd, & kernel.
192 lines
8.0 KiB
Python
Executable File
192 lines
8.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
|
|
container_checker
|
|
|
|
This script is intended to be run by Monit. It writes an alerting message into
syslog if it finds containers which are expected to run but are not running. At
the same time, if some containers are running unexpectedly, it also writes an
alerting syslog message. Note that any string passed to a print(...) statement
executed in this script will be appended to the Monit syslog messages.
|
|
|
|
The following is an example from a Monit configuration file showing how Monit
runs this script:

    check program container_checker with path "/usr/bin/container_checker"
        if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
|
|
"""
|
|
|
|
import docker
|
|
import sys
|
|
|
|
from sonic_py_common import multi_asic, device_info
|
|
from swsscommon import swsscommon
|
|
|
|
# Source name handed to swsscommon.events_init_publisher() in publish_events().
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Event tag published (once per container) for expected containers that are not running.
EVENTS_PUBLISHER_TAG = "event-down-ctr"
|
|
|
|
def _container_instance_names(name, feature, asics_id_presence, run_all_instance_list):
    """
    @summary: Expand one FEATURE table entry into the concrete container names
              that correspond to it on this device.
    @param name: Feature/container name (key of the FEATURE table).
    @param feature: FEATURE table entry for 'name'.
    @param asics_id_presence: ASIC ids reported as present.
    @param run_all_instance_list: Container names whose per-ASIC instances are
                                  counted regardless of ASIC presence.
    @return: A set of container names. On a single ASIC device this is just
             {name}; on a Multi-ASIC device it is built from the
             'has_global_scope' and 'has_per_asic_scope' fields.
    """
    if not multi_asic.is_multi_asic():
        return {name}

    instance_names = set()
    if feature["has_global_scope"] == "True":
        instance_names.add(name)
    if feature["has_per_asic_scope"] == "True":
        for asic_id in range(multi_asic.get_num_asics()):
            if asic_id in asics_id_presence or name in run_all_instance_list:
                instance_names.add(name + str(asic_id))
    return instance_names


def get_expected_running_containers():
    """
    @summary: Build the sets of containers that are expected to run, based on the
              'FEATURE' table in Config DB:
              - A container is expected to run unless its 'state' field is
                'disabled' or 'always_disabled'.
              - A container is "always running" if its 'state' is 'always_enabled'.
              On a Multi-ASIC device the container names are derived from the
              fields 'has_global_scope' and 'has_per_asic_scope' together with
              the number of ASICs; on a single ASIC device the container name is
              used as is.
    @return: A tuple of two sets: the expected running containers, and the
             containers marked as "always_enabled" (plus "database-chassis" on a
             supervisor).
    """
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    # Get current asic presence list. For multi_asic system, multi instance containers
    # should be checked only for asics present.
    asics_id_presence = multi_asic.get_asic_presence_list()

    # Some services may run all the instances irrespective of asic presence.
    # Add those to exception list.
    # database service: Currently services have dependency on all database services to
    # be up irrespective of asic presence.
    # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
    # it will be removed from exception list.
    run_all_instance_list = ['database', 'bgp']

    expected_running_containers = set()
    always_running_containers = set()

    for container_name in feature_table.keys():
        feature = feature_table[container_name]

        if feature["state"] not in ["disabled", "always_disabled"]:
            expected_running_containers.update(
                _container_instance_names(container_name, feature,
                                          asics_id_presence, run_all_instance_list))

        if feature["state"] == 'always_enabled':
            always_running_containers.update(
                _container_instance_names(container_name, feature,
                                          asics_id_presence, run_all_instance_list))

    if device_info.is_supervisor():
        always_running_containers.add("database-chassis")

    return expected_running_containers, always_running_containers
|
|
|
|
def get_current_running_from_DB(always_running_containers):
    """
    @summary: This function will get the current running container list
              from FEATURE table @ STATE_DB, if this table is available.
              Containers in 'always_running_containers' are not looked up in
              the table; they are queried from docker directly and added when
              their status is 'running'.
    @param always_running_containers: Iterable of container names to check
              through the docker API.
    @return: A set which contains the current running containers. The set is
             empty (and docker is not queried) when the FEATURE table in
             STATE_DB has no keys.
    """
    running_containers = set()

    state_db = swsscommon.DBConnector("STATE_DB", 0)
    tbl = swsscommon.Table(state_db, "FEATURE")

    # Read the keys once so that the emptiness check and the loop below work on
    # the same snapshot of the table.
    feature_names = tbl.getKeys()
    if not feature_names:
        return running_containers

    for name in feature_names:
        data = dict(tbl.get(name)[1])
        # An entry with a non-empty 'container_id' field is counted as running.
        if data.get('container_id'):
            running_containers.add(name)

    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    for name in always_running_containers:
        try:
            container = docker_client.containers.get(name)
        except (docker.errors.NotFound, docker.errors.APIError) as err:
            # Best effort: report the failure (the text ends up in Monit's
            # syslog message) and keep checking the remaining containers.
            print("Failed to get container '{}'. Error: '{}'".format(name, err))
        else:
            container_state = container.attrs.get('State', {})
            if container_state.get('Status', "") == 'running':
                running_containers.add(name)

    return running_containers
|
|
|
|
|
|
def get_current_running_from_dockers():
    """
    @summary: Ask the docker daemon for every container whose status is
              "running" and collect their names.
    @return: A set with the names of the containers in running state. If the
             docker API call fails, the error is printed and whatever was
             collected so far (normally an empty set) is returned.
    """
    client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    names = set()
    collection = client.containers
    try:
        names.update(ctr.name for ctr in collection.list(filters={"status": "running"}))
    except docker.errors.APIError as err:
        print("Failed to retrieve the running container list. Error: '{}'".format(err))
    return names
|
|
|
|
|
|
def get_current_running_containers(always_running_containers):
    """
    @summary: Collect the currently running containers by combining the
              containers found through the FEATURE table in STATE_DB with the
              containers docker reports as running.
    @param always_running_containers: Container names forwarded to
              get_current_running_from_DB().
    @return: A set of currently running containers (union of both sources).
    """
    from_db = get_current_running_from_DB(always_running_containers)
    from_dockers = get_current_running_from_dockers()
    from_db |= from_dockers
    return from_db
|
|
|
|
|
|
def publish_events(lst):
    """
    @summary: Publish one EVENTS_PUBLISHER_TAG event per container name in
              'lst', using a publisher opened for EVENTS_PUBLISHER_SOURCE.
    @param lst: Iterable of container names; each name is sent as the
                "ctr_name" parameter of its event.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        for ctr in lst:
            params["ctr_name"] = ctr
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if publishing raised.
        swsscommon.events_deinit_publisher(events_handle)
|
|
|
|
|
|
def main():
    """
    @summary: This function will compare the difference between the current running containers
              and the containers which were expected to run. If containers which were expected
              to run were not running, an event is published for each of them, an alerting
              message is printed (Monit appends it to its syslog message) and the script
              exits with status 3. Otherwise, if containers are running which were not
              expected to run, an alerting message is printed and the script exits with
              status 4. If neither is the case the function simply returns.
    """
    expected_running_containers, always_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers(always_running_containers)

    expected_running_containers |= always_running_containers

    # Sets have no stable iteration order; sort the names so that the alert
    # text is identical from one Monit cycle to the next.
    not_running_containers = sorted(expected_running_containers.difference(current_running_containers))
    if not_running_containers:
        publish_events(not_running_containers)
        print("Expected containers not running: " + ", ".join(not_running_containers))
        sys.exit(3)

    unexpected_running_containers = sorted(current_running_containers.difference(expected_running_containers))
    if unexpected_running_containers:
        print("Unexpected running containers: " + ", ".join(unexpected_running_containers))
        sys.exit(4)
|
|
|
|
|
|
if __name__ == "__main__":
    # main() calls sys.exit(3) or sys.exit(4) itself when it finds a mismatch;
    # reaching the explicit exit below means no mismatch was found.
    main()
    sys.exit(0)
|