container_checker on supervisor should check containers based on asic presence (#11442)

Why I did it
On a supervisor card in a chassis, the syncd, teamd, swss, lldp, etc. Docker containers are created per Switch Fabric card. However, not every chassis has all of its Switch Fabric cards present; in that case, containers are created only for the Switch Fabric cards that are present.

The monit 'container_checker' check fails in this scenario because it expects containers for all Switch Fabric cards (based on NUM_ASIC defined in the asic.conf file).
This commit is contained in:
anamehra 2022-08-22 10:08:29 -07:00 committed by GitHub
parent 2d4ab9e979
commit f404ce60e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 5 deletions

View File

@ -23,7 +23,6 @@ import swsssdk
from sonic_py_common import multi_asic, device_info
from swsscommon import swsscommon
def get_expected_running_containers():
"""
@summary: This function will get the expected running & always-enabled containers by following the rule:
@ -42,6 +41,18 @@ def get_expected_running_containers():
expected_running_containers = set()
always_running_containers = set()
# Get current asic presence list. For multi_asic system, multi instance containers
# should be checked only for asics present.
asics_id_presence = multi_asic.get_asic_presence_list()
# Some services may run all the instances irrespective of asic presence.
# Add those to exception list.
# database service: Currently services have dependency on all database services to
# be up irrespective of asic presence.
# bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
# it will be removed from exception list.
run_all_instance_list = ['database', 'bgp']
for container_name in feature_table.keys():
if feature_table[container_name]["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
@ -50,6 +61,7 @@ def get_expected_running_containers():
if feature_table[container_name]["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
if asic_id in asics_id_presence or container_name in run_all_instance_list:
expected_running_containers.add(container_name + str(asic_id))
else:
expected_running_containers.add(container_name)
@ -60,9 +72,11 @@ def get_expected_running_containers():
if feature_table[container_name]["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
if asic_id in asics_id_presence or container_name in run_all_instance_list:
always_running_containers.add(container_name + str(asic_id))
else:
always_running_containers.add(container_name)
if device_info.is_supervisor():
always_running_containers.add("database-chassis")
return expected_running_containers, always_running_containers

View File

@ -22,7 +22,8 @@ BGP_INTERNAL_NEIGH_CFG_DB_TABLE = 'BGP_INTERNAL_NEIGHBOR'
NEIGH_DEVICE_METADATA_CFG_DB_TABLE = 'DEVICE_NEIGHBOR_METADATA'
DEFAULT_NAMESPACE = ''
PORT_ROLE = 'role'
CHASSIS_STATE_DB='CHASSIS_STATE_DB'
CHASSIS_ASIC_INFO_TABLE='CHASSIS_ASIC_TABLE'
# Dictionary to cache config_db connection handle per namespace
# to prevent duplicate connections from being opened
@ -451,3 +452,31 @@ def validate_namespace(namespace):
return True
else:
return False
def get_asic_presence_list():
    """
    @summary: Build the list of numeric ids of the asics that are present.
              - Not a multi-asic system: the list is empty.
              - Multi-asic, non-supervisor card (e.g. line card): every asic id from
                0 to get_num_asics() - 1 is reported, on the assumption that asics
                are not removable on such cards.
              - Multi-asic supervisor card: only the asics listed as keys of
                CHASSIS_ASIC_TABLE in CHASSIS_STATE_DB are reported, i.e. the asics
                of the fabric cards that are inserted and detected. The first N asic
                ids (asic0 to asic(N-1)) in that table are assumed to belong to the
                supervisor, where N is the max number of asics supported by the chassis.
    @return: List of asic ids (integers) present
    """
    if not is_multi_asic():
        return []

    if not is_supervisor():
        # Asics are assumed to be non-removable outside the supervisor, so all of
        # them (0 .. num_asics - 1) are considered present.
        return list(range(0, get_num_asics()))

    # Supervisor card: some fabric cards may not be inserted. CHASSIS_ASIC_TABLE
    # lists only the asics detected by the platform.
    state_db = swsscommon.DBConnector(CHASSIS_STATE_DB, 0, True)
    chassis_asic_table = swsscommon.Table(state_db, CHASSIS_ASIC_INFO_TABLE)
    if not chassis_asic_table:
        return []

    # Each key is an asic name (asic0, asic1, ... asicN); keep only its numeric id.
    return [
        int(get_asic_id_from_name(asic_name))
        for asic_name in chassis_asic_table.getKeys()
    ]