Add asic presence filtering for container checking in system-health (#13497)

Why I did it: On a supervisor card in a chassis, docker containers such as syncd, teamd, swss, and lldp are created for each Switch Fabric card. However, not every chassis has all of the Switch Fabric cards present; in that case, containers are created only for the Switch Fabric cards that are present. system-health reports errors in this scenario because it expects containers for all Switch Fabric cards (based on NUM_ASIC defined in the asic.conf file). The system-health process error messages were also changed to indicate which container had the issue: multiple containers may run processes with the same name, which can result in identical system-health error messages and therefore ambiguity. How I did it: Ported the container_checker logic from #11442 into service_checker for system-health. How to verify it: Bring up a supervisor card with one or more fabric cards missing, then execute 'show system-health summary'. The command should not report a failure due to missing containers for the ASICs on the fabric cards that are not present.
2023-02-10 21:34:10 -08:00 · 2023-02-10 21:34:10 -08:00 · ad679a0338
commit ad679a0338
parent f0f7639fa2
1 changed files with 18 additions and 4 deletions
--- a/src/system-health/health_checker/service_checker.py
+++ b/src/system-health/health_checker/service_checker.py
@ -71,6 +71,19 @@ class ServiceChecker(HealthChecker):
        """
        expected_running_containers = set()
        container_feature_dict = {}
+
+        # Get current asic presence list. For multi_asic system, multi instance containers
+        # should be checked only for asics present.
+        asics_id_presence = multi_asic.get_asic_presence_list()
+
+        # Some services may run all the instances irrespective of asic presence.
+        # Add those to exception list.
+        # database service: Currently services have dependency on all database services to
+        # be up irrespective of asic presence.
+        # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
+        # it will be removed from exception list.
+        run_all_instance_list = ['database', 'bgp']
+
        for feature_name, feature_entry in feature_table.items():
            if feature_entry["state"] not in ["disabled", "always_disabled"]:
                if multi_asic.is_multi_asic():
@ -80,8 +93,9 @@ class ServiceChecker(HealthChecker):
                    if feature_entry["has_per_asic_scope"] == "True":
                        num_asics = multi_asic.get_num_asics()
                        for asic_id in range(num_asics):
-                            expected_running_containers.add(feature_name + str(asic_id))
-                            container_feature_dict[feature_name + str(asic_id)] = feature_name
+                            if asic_id in asics_id_presence or feature_name in run_all_instance_list:
+                                expected_running_containers.add(feature_name + str(asic_id))
+                                container_feature_dict[feature_name + str(asic_id)] = feature_name
                else:
                    expected_running_containers.add(feature_name)
                    container_feature_dict[feature_name] = feature_name
@ -342,7 +356,7 @@ class ServiceChecker(HealthChecker):
                process_status = utils.run_command(cmd)
                if process_status is None:
                    for process_name in critical_process_list:
-                        self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
+                        self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
                    self.publish_events(container_name, critical_process_list)
                    return

@ -355,6 +369,6 @@ class ServiceChecker(HealthChecker):
                    # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
                    if process_name in process_status:
                        if process_status[process_name] != 'RUNNING':
-                            self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
+                            self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
                        else:
                            self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))