import docker import os import pickle import re from swsscommon import swsscommon from sonic_py_common import multi_asic from sonic_py_common.logger import Logger from .health_checker import HealthChecker from . import utils SYSLOG_IDENTIFIER = 'service_checker' logger = Logger(log_identifier=SYSLOG_IDENTIFIER) EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_TAG = "process-not-running" class ServiceChecker(HealthChecker): """ Checker that checks critical system service status via monit service. """ # Cache file to save container_critical_processes CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache' CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes' # Command to get merged directory of a container GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"' # Command to query the status of monit service. CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service' # Command to get summary of critical system service. CHECK_CMD = 'monit summary -B' MIN_CHECK_CMD_LINES = 3 # Expect status for different system service category. EXPECT_STATUS_DICT = { 'System': 'Running', 'Process': 'Running', 'Filesystem': 'Accessible', 'Program': 'Status ok' } def __init__(self): HealthChecker.__init__(self) self.container_critical_processes = {} # Containers that has invalid critical_processes file self.bad_containers = set() self.container_feature_dict = {} self.need_save_cache = False self.config_db = None self.load_critical_process_cache() self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) def get_expected_running_containers(self, feature_table): """Get a set of containers that are expected to running on SONiC Args: feature_table (object): FEATURE table in CONFIG_DB Returns: expected_running_containers: A set of container names that are expected running container_feature_dict: A dictionary {:} """ expected_running_containers = set() container_feature_dict = {} # Get current asic presence list. For multi_asic system, multi instance containers # should be checked only for asics present. asics_id_presence = multi_asic.get_asic_presence_list() # Some services may run all the instances irrespective of asic presence. # Add those to exception list. # database service: Currently services have dependency on all database services to # be up irrespective of asic presence. # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven, # it will be removed from exception list. run_all_instance_list = ['database', 'bgp'] for feature_name, feature_entry in feature_table.items(): if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): if feature_entry["has_global_scope"] == "True": expected_running_containers.add(feature_name) container_feature_dict[feature_name] = feature_name if feature_entry["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): if asic_id in asics_id_presence or feature_name in run_all_instance_list: expected_running_containers.add(feature_name + str(asic_id)) container_feature_dict[feature_name + str(asic_id)] = feature_name else: expected_running_containers.add(feature_name) container_feature_dict[feature_name] = feature_name return expected_running_containers, container_feature_dict def get_current_running_containers(self): """Get current running containers, if the running container is not in self.container_critical_processes, try get the critical process list Returns: running_containers: A set of running container names """ DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock') running_containers = set() ctrs = DOCKER_CLIENT.containers try: lst = ctrs.list(filters={"status": "running"}) for ctr in lst: running_containers.add(ctr.name) if ctr.name not in self.container_critical_processes: self.fill_critical_process_by_container(ctr.name) except docker.errors.APIError as err: logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err)) return running_containers def get_critical_process_list_from_file(self, container, critical_processes_file): """Read critical process name list from critical processes file Args: container (str): contianer name critical_processes_file (str): critical processes file path Returns: critical_process_list: A list of critical process names """ critical_process_list = [] with open(critical_processes_file, 'r') as file: for line in file: # Try to match a line like "program:" match = re.match(r"^\s*((.+):(.*))*\s*$", line) if match is None: if container not in self.bad_containers: self.bad_containers.add(container) logger.log_error('Invalid syntax in critical_processes file of {}'.format(container)) continue if match.group(1) is not None: identifier_key = match.group(2).strip() identifier_value = match.group(3).strip() if identifier_key == "program" and identifier_value: critical_process_list.append(identifier_value) return critical_process_list def fill_critical_process_by_container(self, container): """Get critical process for a given container Args: container (str): container name """ # Get container volumn folder container_folder = self._get_container_folder(container) if not container_folder: logger.log_warning('Could not find MergedDir of container {}, was container stopped?'.format(container)) return if not os.path.exists(container_folder): logger.log_warning('MergedDir {} of container {} not found in filesystem, was container stopped?'.format(container_folder, container)) return # Get critical_processes file path critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH) if not os.path.isfile(critical_processes_file): # Critical process file does not exist, the container has no critical processes. logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file)) self._update_container_critical_processes(container, []) return # Get critical process list from critical_processes critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file) self._update_container_critical_processes(container, critical_process_list) def _update_container_critical_processes(self, container, critical_process_list): self.container_critical_processes[container] = critical_process_list self.need_save_cache = True def _get_container_folder(self, container): container_folder = utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container)) if container_folder is None: return container_folder return container_folder.strip() def save_critical_process_cache(self): """Save self.container_critical_processes to a cache file """ if not self.need_save_cache: return self.need_save_cache = False if not self.container_critical_processes: # if container_critical_processes is empty, don't save it return if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE): # if cache file exists, remove it os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE) with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f: pickle.dump(self.container_critical_processes, f) def load_critical_process_cache(self): if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE): # cache file does not exist return with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f: self.container_critical_processes = pickle.load(f) def reset(self): self._info = {} def get_category(self): return 'Services' def check_by_monit(self, config): """ et and analyze the output of $CHECK_CMD, collect status for file system or customize checker if any. :param config: Health checker configuration. :return: """ output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD) if not output or output.strip() != 'active': self.set_object_not_ok('Service', 'monit', 'monit service is not running') return output = utils.run_command(ServiceChecker.CHECK_CMD) lines = output.splitlines() if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES: self.set_object_not_ok('Service', 'monit', 'monit service is not ready') return status_begin = lines[1].find('Status') type_begin = lines[1].find('Type') if status_begin < 0 or type_begin < 0: self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible') return for line in lines[2:]: name = line[0:status_begin].strip() if config and config.ignore_services and name in config.ignore_services: continue status = line[status_begin:type_begin].strip() service_type = line[type_begin:].strip() if service_type not in ServiceChecker.EXPECT_STATUS_DICT: continue expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type] if expect_status != status: self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status)) else: self.set_object_ok(service_type, name) return def check_services(self, config): """Check status of critical services and critical processes Args: config (config.Config): Health checker configuration. """ if not self.config_db: self.config_db = swsscommon.ConfigDBConnector() self.config_db.connect() feature_table = self.config_db.get_table("FEATURE") expected_running_containers, self.container_feature_dict = self.get_expected_running_containers(feature_table) current_running_containers = self.get_current_running_containers() newly_disabled_containers = set(self.container_critical_processes.keys()).difference(expected_running_containers) for newly_disabled_container in newly_disabled_containers: self.container_critical_processes.pop(newly_disabled_container) self.save_critical_process_cache() not_running_containers = expected_running_containers.difference(current_running_containers) for container in not_running_containers: self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container)) if not self.container_critical_processes: # Critical process is empty, not expect self.set_object_not_ok('Service', 'system', 'no critical process found') return for container, critical_process_list in self.container_critical_processes.items(): self.check_process_existence(container, critical_process_list, config, feature_table) for bad_container in self.bad_containers: self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect') def check(self, config): """Check critical system service status. Args: config (object): Health checker configuration. """ self.reset() self.check_by_monit(config) self.check_services(config) swsscommon.events_deinit_publisher(self.events_handle) def _parse_supervisorctl_status(self, process_status): """Expected input: arp_update RUNNING pid 67, uptime 1:03:56 buffermgrd RUNNING pid 81, uptime 1:03:56 Args: process_status (list): List of process status """ data = {} for line in process_status: line = line.strip() if not line: continue items = line.split() if len(items) < 2: continue data[items[0].strip()] = items[1].strip() return data def publish_events(self, container_name, critical_process_list): params = swsscommon.FieldValueMap() params["ctr_name"] = container_name for process_name in critical_process_list: params["process_name"] = process_name swsscommon.event_publish(self.events_handle, EVENTS_PUBLISHER_TAG, params) def check_process_existence(self, container_name, critical_process_list, config, feature_table): """Check whether the process in the specified container is running or not. Args: container_name (str): Container name critical_process_list (list): Critical processes config (object): Health checker configuration. feature_table (object): Feature table """ feature_name = self.container_feature_dict[container_name] if feature_name in feature_table: # We look into the 'FEATURE' table to verify whether the container is disabled or not. # If the container is diabled, we exit. if ("state" in feature_table[feature_name] and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]): # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because # it not always possible to get process cmdline in supervisor.conf. E.g, cmdline of orchagent is "/usr/bin/orchagent", # however, in supervisor.conf it is "/usr/bin/orchagent.sh" cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name) process_status = utils.run_command(cmd) if process_status is None: for process_name in critical_process_list: self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name)) self.publish_events(container_name, critical_process_list) return process_status = self._parse_supervisorctl_status(process_status.strip().splitlines()) for process_name in critical_process_list: if config and config.ignore_services and process_name in config.ignore_services: continue # Sometimes process_name is in critical_processes file, but it is not in supervisor.conf, such process will not run in container. # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status. if process_name in process_status: if process_status[process_name] != 'RUNNING': self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name)) else: self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))