Add asic presence filtering for container checking in system-health (#13497)
Why I did it: On a supervisor card in a chassis, syncd/teamd/swss/lldp etc. dockers are created for each Switch Fabric card. However, not every chassis has all the Switch Fabric cards present; in that case, only the dockers for the Switch Fabric cards that are present are created. system-health reports errors in this scenario because it expects dockers for all Switch Fabric cards (based on NUM_ASIC defined in the asic.conf file). The system-health process error messages were also altered to indicate which container had the issue: multiple containers may run processes with the same name, which can result in identical system-health error messages and cause ambiguity. How I did it: Ported the container_checker logic from #11442 into service_checker for system-health. How to verify it: Bring up a Supervisor card with one or more missing fabric cards, then execute 'show system-health summary'. The command should not report failures due to missing dockers for the ASICs on the fabric cards that are not present.
This commit is contained in:
parent
f0f7639fa2
commit
ad679a0338
@ -71,6 +71,19 @@ class ServiceChecker(HealthChecker):
|
|||||||
"""
|
"""
|
||||||
expected_running_containers = set()
|
expected_running_containers = set()
|
||||||
container_feature_dict = {}
|
container_feature_dict = {}
|
||||||
|
|
||||||
|
# Get current asic presence list. For multi_asic system, multi instance containers
|
||||||
|
# should be checked only for asics present.
|
||||||
|
asics_id_presence = multi_asic.get_asic_presence_list()
|
||||||
|
|
||||||
|
# Some services may run all the instances irrespective of asic presence.
|
||||||
|
# Add those to exception list.
|
||||||
|
# database service: Currently services have dependency on all database services to
|
||||||
|
# be up irrespective of asic presence.
|
||||||
|
# bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
|
||||||
|
# it will be removed from exception list.
|
||||||
|
run_all_instance_list = ['database', 'bgp']
|
||||||
|
|
||||||
for feature_name, feature_entry in feature_table.items():
|
for feature_name, feature_entry in feature_table.items():
|
||||||
if feature_entry["state"] not in ["disabled", "always_disabled"]:
|
if feature_entry["state"] not in ["disabled", "always_disabled"]:
|
||||||
if multi_asic.is_multi_asic():
|
if multi_asic.is_multi_asic():
|
||||||
@ -80,8 +93,9 @@ class ServiceChecker(HealthChecker):
|
|||||||
if feature_entry["has_per_asic_scope"] == "True":
|
if feature_entry["has_per_asic_scope"] == "True":
|
||||||
num_asics = multi_asic.get_num_asics()
|
num_asics = multi_asic.get_num_asics()
|
||||||
for asic_id in range(num_asics):
|
for asic_id in range(num_asics):
|
||||||
expected_running_containers.add(feature_name + str(asic_id))
|
if asic_id in asics_id_presence or feature_name in run_all_instance_list:
|
||||||
container_feature_dict[feature_name + str(asic_id)] = feature_name
|
expected_running_containers.add(feature_name + str(asic_id))
|
||||||
|
container_feature_dict[feature_name + str(asic_id)] = feature_name
|
||||||
else:
|
else:
|
||||||
expected_running_containers.add(feature_name)
|
expected_running_containers.add(feature_name)
|
||||||
container_feature_dict[feature_name] = feature_name
|
container_feature_dict[feature_name] = feature_name
|
||||||
@ -342,7 +356,7 @@ class ServiceChecker(HealthChecker):
|
|||||||
process_status = utils.run_command(cmd)
|
process_status = utils.run_command(cmd)
|
||||||
if process_status is None:
|
if process_status is None:
|
||||||
for process_name in critical_process_list:
|
for process_name in critical_process_list:
|
||||||
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
|
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
|
||||||
self.publish_events(container_name, critical_process_list)
|
self.publish_events(container_name, critical_process_list)
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -355,6 +369,6 @@ class ServiceChecker(HealthChecker):
|
|||||||
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
|
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
|
||||||
if process_name in process_status:
|
if process_name in process_status:
|
||||||
if process_status[process_name] != 'RUNNING':
|
if process_status[process_name] != 'RUNNING':
|
||||||
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
|
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
|
||||||
else:
|
else:
|
||||||
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))
|
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))
|
||||||
|
Loading…
Reference in New Issue
Block a user