#### Why I did it
On a supervisor card in a chassis, the syncd/teamd/swss/lldp (etc.) dockers are created for each Switch Fabric card. However, not every chassis has all of its Switch Fabric cards present; in that case only the dockers for the Switch Fabric cards that are present are created. system-health reports errors in this scenario because it expects dockers for all Switch Fabric cards (based on NUM_ASIC defined in the asic.conf file).

system-health process error messages were also changed to indicate which container had the issue: multiple containers may run processes with the same name, which previously produced identical, ambiguous error messages.

#### How I did it
Port the container_checker logic from #11442 into the service_checker used by system-health.

#### How to verify it
Bring up a Supervisor card with one or more fabric cards missing and run 'show system-health summary'. The command should not report failures for missing dockers belonging to the asics on the absent fabric cards.
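The core of the port is simple: when expanding a per-asic feature into container instances, skip asic indices whose fabric card is not physically present, instead of expecting one docker per NUM_ASIC. Below is a minimal, self-contained sketch of that filtering idea (the helper name and the example values are illustrative only; the real logic lives in `ServiceChecker.get_expected_running_containers()` below, which queries `multi_asic.get_asic_presence_list()`):

```python
# Illustrative sketch -- not the actual SONiC implementation.
def expected_instances(feature_name, num_asics, asics_present, run_all=('database', 'bgp')):
    """Per-asic container names that system-health should expect to be running."""
    return {
        feature_name + str(asic_id)
        for asic_id in range(num_asics)
        # Skip asics whose fabric card is absent, unless the service runs all instances anyway.
        if asic_id in asics_present or feature_name in run_all
    }

# Example: NUM_ASIC=4 on the supervisor, but only fabric asics 0 and 2 are inserted.
assert expected_instances('swss', 4, {0, 2}) == {'swss0', 'swss2'}
assert expected_instances('bgp', 4, {0, 2}) == {'bgp0', 'bgp1', 'bgp2', 'bgp3'}
```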
import docker
import os
import pickle
import re

from swsscommon import swsscommon
from sonic_py_common import multi_asic
from sonic_py_common.logger import Logger
from .health_checker import HealthChecker
from . import utils

SYSLOG_IDENTIFIER = 'service_checker'
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)

EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "process-not-running"

class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Cache file to save container_critical_processes
    CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache'

    CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes'

    # Command to get merged directory of a container
    GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"'

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

    # Command to get summary of critical system service.
    CHECK_CMD = 'monit summary -B'
    MIN_CHECK_CMD_LINES = 3

    # Expected status for each system service category.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible',
        'Program': 'Status ok'
    }

    def __init__(self):
        HealthChecker.__init__(self)
        self.container_critical_processes = {}
        # Containers that have an invalid critical_processes file
        self.bad_containers = set()

        self.container_feature_dict = {}

        self.need_save_cache = False

        self.config_db = None

        self.load_critical_process_cache()

        self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)

    def get_expected_running_containers(self, feature_table):
        """Get a set of containers that are expected to be running on SONiC

        Args:
            feature_table (object): FEATURE table in CONFIG_DB

        Returns:
            expected_running_containers: A set of container names that are expected to be running
            container_feature_dict: A dictionary {<container_name>: <feature_name>}
        """
        expected_running_containers = set()
        container_feature_dict = {}

        # Get the current asic presence list. For a multi_asic system, multi-instance containers
        # should be checked only for the asics that are present.
        asics_id_presence = multi_asic.get_asic_presence_list()

        # Some services run all their instances irrespective of asic presence.
        # Add those to the exception list.
        # database service: Currently services depend on all database instances being up
        # irrespective of asic presence.
        # bgp service: Currently bgp runs all instances. Once this is made config driven,
        # it will be removed from the exception list.
        run_all_instance_list = ['database', 'bgp']

        for feature_name, feature_entry in feature_table.items():
            if feature_entry["state"] not in ["disabled", "always_disabled"]:
                if multi_asic.is_multi_asic():
                    if feature_entry["has_global_scope"] == "True":
                        expected_running_containers.add(feature_name)
                        container_feature_dict[feature_name] = feature_name
                    if feature_entry["has_per_asic_scope"] == "True":
                        num_asics = multi_asic.get_num_asics()
                        for asic_id in range(num_asics):
                            if asic_id in asics_id_presence or feature_name in run_all_instance_list:
                                expected_running_containers.add(feature_name + str(asic_id))
                                container_feature_dict[feature_name + str(asic_id)] = feature_name
                else:
                    expected_running_containers.add(feature_name)
                    container_feature_dict[feature_name] = feature_name

        return expected_running_containers, container_feature_dict

    def get_current_running_containers(self):
        """Get the currently running containers; if a running container is not yet in
        self.container_critical_processes, try to load its critical process list

        Returns:
            running_containers: A set of running container names
        """
        DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock')
        running_containers = set()
        ctrs = DOCKER_CLIENT.containers
        try:
            lst = ctrs.list(filters={"status": "running"})

            for ctr in lst:
                running_containers.add(ctr.name)
                if ctr.name not in self.container_critical_processes:
                    self.fill_critical_process_by_container(ctr.name)
        except docker.errors.APIError as err:
            logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err))

        return running_containers

    def get_critical_process_list_from_file(self, container, critical_processes_file):
        """Read the critical process name list from a critical_processes file

        Args:
            container (str): container name
            critical_processes_file (str): critical processes file path

        Returns:
            critical_process_list: A list of critical process names
        """
        critical_process_list = []

        with open(critical_processes_file, 'r') as file:
            for line in file:
                # Try to match a line like "program:<process_name>"
                match = re.match(r"^\s*((.+):(.*))*\s*$", line)
                if match is None:
                    if container not in self.bad_containers:
                        self.bad_containers.add(container)
                        logger.log_error('Invalid syntax in critical_processes file of {}'.format(container))
                    continue
                if match.group(1) is not None:
                    identifier_key = match.group(2).strip()
                    identifier_value = match.group(3).strip()
                    if identifier_key == "program" and identifier_value:
                        critical_process_list.append(identifier_value)

        return critical_process_list

    def fill_critical_process_by_container(self, container):
        """Get the critical processes for a given container

        Args:
            container (str): container name
        """
        # Get the container volume folder
        container_folder = self._get_container_folder(container)
        if not container_folder:
            logger.log_warning('Could not find MergedDir of container {}, was container stopped?'.format(container))
            return

        if not os.path.exists(container_folder):
            logger.log_warning('MergedDir {} of container {} not found in filesystem, was container stopped?'.format(container_folder, container))
            return

        # Get the critical_processes file path
        critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH)
        if not os.path.isfile(critical_processes_file):
            # The critical_processes file does not exist, so the container has no critical processes.
            logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file))
            self._update_container_critical_processes(container, [])
            return

        # Get the critical process list from critical_processes
        critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file)
        self._update_container_critical_processes(container, critical_process_list)

    def _update_container_critical_processes(self, container, critical_process_list):
        self.container_critical_processes[container] = critical_process_list
        self.need_save_cache = True

    def _get_container_folder(self, container):
        container_folder = utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container))
        if container_folder is None:
            return container_folder

        return container_folder.strip()

    def save_critical_process_cache(self):
        """Save self.container_critical_processes to a cache file
        """
        if not self.need_save_cache:
            return

        self.need_save_cache = False
        if not self.container_critical_processes:
            # if container_critical_processes is empty, don't save it
            return

        if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE):
            # if cache file exists, remove it
            os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE)

        with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f:
            pickle.dump(self.container_critical_processes, f)

    def load_critical_process_cache(self):
        if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE):
            # cache file does not exist
            return

        with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f:
            self.container_critical_processes = pickle.load(f)

    def reset(self):
        self._info = {}

    def get_category(self):
        return 'Services'

    def check_by_monit(self, config):
        """
        Get and analyze the output of $CHECK_CMD, collecting status for filesystems and customized checkers if any.
        :param config: Health checker configuration.
        :return:
        """
        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
        if not output or output.strip() != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        output = utils.run_command(ServiceChecker.CHECK_CMD)
        lines = output.splitlines()
        if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
            return

        status_begin = lines[1].find('Status')
        type_begin = lines[1].find('Type')
        if status_begin < 0 or type_begin < 0:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        for line in lines[2:]:
            name = line[0:status_begin].strip()
            if config and config.ignore_services and name in config.ignore_services:
                continue
            status = line[status_begin:type_begin].strip()
            service_type = line[type_begin:].strip()
            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
                continue
            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
            if expect_status != status:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
            else:
                self.set_object_ok(service_type, name)
        return

    def check_services(self, config):
        """Check the status of critical services and critical processes

        Args:
            config (config.Config): Health checker configuration.
        """
        if not self.config_db:
            self.config_db = swsscommon.ConfigDBConnector()
            self.config_db.connect()
        feature_table = self.config_db.get_table("FEATURE")
        expected_running_containers, self.container_feature_dict = self.get_expected_running_containers(feature_table)
        current_running_containers = self.get_current_running_containers()

        newly_disabled_containers = set(self.container_critical_processes.keys()).difference(expected_running_containers)
        for newly_disabled_container in newly_disabled_containers:
            self.container_critical_processes.pop(newly_disabled_container)

        self.save_critical_process_cache()

        not_running_containers = expected_running_containers.difference(current_running_containers)
        for container in not_running_containers:
            self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container))

        if not self.container_critical_processes:
            # The critical process list is empty, which is not expected
            self.set_object_not_ok('Service', 'system', 'no critical process found')
            return

        for container, critical_process_list in self.container_critical_processes.items():
            self.check_process_existence(container, critical_process_list, config, feature_table)

        for bad_container in self.bad_containers:
            self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect')

    def check(self, config):
        """Check critical system service status.

        Args:
            config (object): Health checker configuration.
        """
        self.reset()
        self.check_by_monit(config)
        self.check_services(config)
        swsscommon.events_deinit_publisher(self.events_handle)

    def _parse_supervisorctl_status(self, process_status):
        """Expected input:
            arp_update        RUNNING   pid 67, uptime 1:03:56
            buffermgrd        RUNNING   pid 81, uptime 1:03:56

        Args:
            process_status (list): List of process status lines
        """
        data = {}
        for line in process_status:
            line = line.strip()
            if not line:
                continue
            items = line.split()
            if len(items) < 2:
                continue
            data[items[0].strip()] = items[1].strip()
        return data

    def publish_events(self, container_name, critical_process_list):
        params = swsscommon.FieldValueMap()
        params["ctr_name"] = container_name
        for process_name in critical_process_list:
            params["process_name"] = process_name
            swsscommon.event_publish(self.events_handle, EVENTS_PUBLISHER_TAG, params)

    def check_process_existence(self, container_name, critical_process_list, config, feature_table):
        """Check whether the critical processes in the specified container are running.

        Args:
            container_name (str): Container name
            critical_process_list (list): Critical processes
            config (object): Health checker configuration.
            feature_table (object): Feature table
        """
        feature_name = self.container_feature_dict[container_name]
        if feature_name in feature_table:
            # We look into the 'FEATURE' table to verify whether the container is disabled or not.
            # If the container is disabled, we exit.
            if ("state" in feature_table[feature_name]
                    and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]):
                # We use supervisorctl status to check the critical process status. We cannot leverage psutil here because
                # it is not always possible to get the process cmdline from supervisor.conf. E.g., the cmdline of orchagent is
                # "/usr/bin/orchagent", however, in supervisor.conf it is "/usr/bin/orchagent.sh".
                cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name)
                process_status = utils.run_command(cmd)
                if process_status is None:
                    for process_name in critical_process_list:
                        self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
                    self.publish_events(container_name, critical_process_list)
                    return

                process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())
                for process_name in critical_process_list:
                    if config and config.ignore_services and process_name in config.ignore_services:
                        continue

                    # Sometimes a process_name is in the critical_processes file but not in supervisor.conf; such a process
                    # will not run in the container and it is safe to ignore it (e.g., radv). So here we only check the
                    # processes that appear in process_status.
                    if process_name in process_status:
                        if process_status[process_name] != 'RUNNING':
                            self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
                        else:
                            self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))