From f404ce60e01214f7eec7a2d36946b3c5bad3eb11 Mon Sep 17 00:00:00 2001 From: anamehra <54692434+anamehra@users.noreply.github.com> Date: Mon, 22 Aug 2022 10:08:29 -0700 Subject: [PATCH] container_checker on supervisor should check containers based on asic presence (#11442) Why I did it On a supervisor card in a chassis, syncd/teamd/swss/lldp etc dockers are created for each Switch Fabric card. However, not all chassis would have all the switch fabric cards present. In this case, only dockers for Switch Fabrics present would be created. The monit 'container_checker' fails in this scenario as it is expecting dockers for all Switch Fabrics (based on NUM_ASIC defined in asic.conf file). --- files/image_config/monit/container_checker | 22 ++++++++++--- .../sonic_py_common/multi_asic.py | 31 ++++++++++++++++++- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/files/image_config/monit/container_checker b/files/image_config/monit/container_checker index a67a96a0c1..c6271d26c8 100755 --- a/files/image_config/monit/container_checker +++ b/files/image_config/monit/container_checker @@ -23,7 +23,6 @@ import swsssdk from sonic_py_common import multi_asic, device_info from swsscommon import swsscommon - def get_expected_running_containers(): """ @summary: This function will get the expected running & always-enabled containers by following the rule: @@ -41,7 +40,19 @@ def get_expected_running_containers(): expected_running_containers = set() always_running_containers = set() - + + # Get current asic presence list. For multi_asic system, multi instance containers + # should be checked only for asics present. + asics_id_presence = multi_asic.get_asic_presence_list() + + # Some services may run all the instances irrespective of asic presence. + # Add those to exception list. + # database service: Currently services have dependency on all database services to + # be up irrespective of asic presence. + # bgp service: Currently bgp runs all instances. 
Once this is fixed to be config driven, + # it will be removed from exception list. + run_all_instance_list = ['database', 'bgp'] + for container_name in feature_table.keys(): if feature_table[container_name]["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): @@ -50,7 +61,8 @@ def get_expected_running_containers(): if feature_table[container_name]["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - expected_running_containers.add(container_name + str(asic_id)) + if asic_id in asics_id_presence or container_name in run_all_instance_list: + expected_running_containers.add(container_name + str(asic_id)) else: expected_running_containers.add(container_name) if feature_table[container_name]["state"] == 'always_enabled': @@ -60,9 +72,11 @@ def get_expected_running_containers(): if feature_table[container_name]["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - always_running_containers.add(container_name + str(asic_id)) + if asic_id in asics_id_presence or container_name in run_all_instance_list: + always_running_containers.add(container_name + str(asic_id)) else: always_running_containers.add(container_name) + if device_info.is_supervisor(): always_running_containers.add("database-chassis") return expected_running_containers, always_running_containers diff --git a/src/sonic-py-common/sonic_py_common/multi_asic.py b/src/sonic-py-common/sonic_py_common/multi_asic.py index 8ba409165f..e08746be03 100644 --- a/src/sonic-py-common/sonic_py_common/multi_asic.py +++ b/src/sonic-py-common/sonic_py_common/multi_asic.py @@ -22,7 +22,8 @@ BGP_INTERNAL_NEIGH_CFG_DB_TABLE = 'BGP_INTERNAL_NEIGHBOR' NEIGH_DEVICE_METADATA_CFG_DB_TABLE = 'DEVICE_NEIGHBOR_METADATA' DEFAULT_NAMESPACE = '' PORT_ROLE = 'role' - +CHASSIS_STATE_DB='CHASSIS_STATE_DB' +CHASSIS_ASIC_INFO_TABLE='CHASSIS_ASIC_TABLE' # Dictionary to cache config_db connection handle per 
namespace # to prevent duplicate connections from being opened @@ -451,3 +452,31 @@ def validate_namespace(namespace): return True else: return False + +def get_asic_presence_list(): + """ + @summary: This function will get the asic presence list. On Supervisor, the list includes only the asics + for inserted and detected fabric cards. For non-supervisor cards, e.g. line card, the list should + contain all supported asics by the card. The function gets the asic list from CHASSIS_ASIC_TABLE from + CHASSIS_STATE_DB. The function assumes that the first N asic ids (asic0 to asic(N-1)) in + CHASSIS_ASIC_TABLE belong to the supervisor, where N is the max number of asics supported by the Chassis + @return: List of asics present + """ + asics_list = [] + if is_multi_asic(): + if not is_supervisor(): + # This is not supervisor, all asics should be present. Assuming that asics + # are not removable entities on Line Cards. Add all asics, 0 - num_asics to the list. + asics_list = list(range(0, get_num_asics())) + else: + # This is supervisor card. Some fabric cards may not be inserted. + # Get asic list from CHASSIS_ASIC_TABLE which lists only the asics + # present based on Fabric card detection by the platform. + db = swsscommon.DBConnector(CHASSIS_STATE_DB, 0, True) + asic_table = swsscommon.Table(db, CHASSIS_ASIC_INFO_TABLE) + if asic_table: + asics_presence_list = list(asic_table.getKeys()) + for asic in asics_presence_list: + # asic is asic id: asic0, asic1.... asicN. Get the numeric value. + asics_list.append(int(get_asic_id_from_name(asic))) + return asics_list