From 11a93d2f9211b63f85b260887db55872a592c617 Mon Sep 17 00:00:00 2001
From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com>
Date: Wed, 24 Nov 2021 07:47:48 +0800
Subject: [PATCH] [system-health] No longer check critical process/service status via monit (#9068)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HLD updated here: https://github.com/Azure/SONiC/pull/887

#### Why I did it
The command `monit summary -B` can no longer display the status of each critical process, so system-health should not depend on it and needs another way to monitor critical processes. This PR addresses that. monit is still used by system-health for the file system check as well as user-customized checks.

#### How I did it
1. Get container names from the FEATURE table.
2. For each container, collect critical process names from the file `critical_processes`.
3. Use `docker exec -it <container_name> bash -c 'supervisorctl status'` to get the status of the processes inside the container, parse the output, and check whether any critical process has exited. A minimal sketch of this step is shown below.

#### How to verify it
1. Add unit test cases to cover it.
2. Adjust sonic-mgmt cases to cover it.
3. Manual test.
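For illustration only — a minimal, standalone sketch of step 3 above. It is not part of this patch: the container and process names are examples, and plain `subprocess` is used here instead of the checker's own `utils.run_command` helper.

```python
import subprocess

def get_process_status(container_name):
    """Return {process_name: state} as reported by supervisorctl inside a container."""
    cmd = ['docker', 'exec', container_name, 'bash', '-c', 'supervisorctl status']
    # supervisorctl exits non-zero when some processes are not RUNNING,
    # so the exit code is deliberately not checked here
    result = subprocess.run(cmd, capture_output=True, text=True)
    status = {}
    for line in result.stdout.splitlines():
        items = line.split()
        if len(items) >= 2:
            status[items[0]] = items[1]  # e.g. {'snmpd': 'RUNNING'}
    return status

# Hypothetical usage: flag critical processes of the snmp container that are not RUNNING
critical_processes = ['snmpd', 'snmp-subagent']
status = get_process_status('snmp')
not_running = [p for p in critical_processes if status.get(p) != 'RUNNING']
```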
---
 rules/system-health.mk                        |   1 +
 src/system-health/health_checker/manager.py   |  21 +-
 .../health_checker/service_checker.py         | 287 ++++++++++++++-
 src/system-health/setup.py                    |   1 +
 .../tests/etc/supervisor/critical_processes   |   2 +
 .../system_health_monitoring_config.json      |  11 +
 src/system-health/tests/test_system_health.py | 330 +++++++++++++++++-
 7 files changed, 624 insertions(+), 29 deletions(-)
 create mode 100644 src/system-health/tests/etc/supervisor/critical_processes
 create mode 100644 src/system-health/tests/system_health_monitoring_config.json

diff --git a/rules/system-health.mk b/rules/system-health.mk
index 00cd48858d..2d1012fe18 100644
--- a/rules/system-health.mk
+++ b/rules/system-health.mk
@@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
 $(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
 $(SYSTEM_HEALTH)_PYTHON_VERSION = 3
 $(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3)
+$(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON)
 SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
 export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"

diff --git a/src/system-health/health_checker/manager.py b/src/system-health/health_checker/manager.py
index f31e482e33..f6bc31f874 100644
--- a/src/system-health/health_checker/manager.py
+++ b/src/system-health/health_checker/manager.py
@@ -1,3 +1,11 @@
+from . import utils
+from .config import Config
+from .health_checker import HealthChecker
+from .service_checker import ServiceChecker
+from .hardware_checker import HardwareChecker
+from .user_defined_checker import UserDefinedChecker
+
+
 class HealthCheckerManager(object):
     """
     Manage all system health checkers and system health configuration.
@@ -10,7 +18,6 @@ class HealthCheckerManager(object):
         self._checkers = []
         self._state = self.STATE_BOOTING
 
-        from .config import Config
         self.config = Config()
         self.initialize()
 
@@ -19,8 +26,6 @@
         Initialize the manager. Create service checker and hardware checker by default.
         :return:
         """
-        from .service_checker import ServiceChecker
-        from .hardware_checker import HardwareChecker
         self._checkers.append(ServiceChecker())
         self._checkers.append(HardwareChecker())
 
@@ -31,7 +36,6 @@ class HealthCheckerManager(object):
         :return: A tuple. The first element indicate the status of the checker; the second
             element is a dictionary that contains the status for all objects that was checked.
         """
-        from .health_checker import HealthChecker
         HealthChecker.summary = HealthChecker.STATUS_OK
         stats = {}
         self.config.load_config()
@@ -45,7 +49,6 @@ class HealthCheckerManager(object):
             self._do_check(checker, stats)
 
         if self.config.user_defined_checkers:
-            from .user_defined_checker import UserDefinedChecker
             for udc in self.config.user_defined_checkers:
                 checker = UserDefinedChecker(udc)
                 self._do_check(checker, stats)
@@ -71,11 +74,12 @@ class HealthCheckerManager(object):
             else:
                 stats[category].update(info)
         except Exception as e:
-            from .health_checker import HealthChecker
+
             HealthChecker.summary = HealthChecker.STATUS_NOT_OK
             error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
             entry = {str(checker): {
                 HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
-                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
+                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg,
+                HealthChecker.INFO_FIELD_OBJECT_TYPE: "Internal"
             }}
             if 'Internal' not in stats:
                 stats['Internal'] = entry
@@ -83,8 +87,7 @@ class HealthCheckerManager(object):
             stats['Internal'].update(entry)
 
     def _is_system_booting(self):
-        from .utils import get_uptime
-        uptime = get_uptime()
+        uptime = utils.get_uptime()
         if not self.boot_timeout:
             self.boot_timeout = self.config.get_bootup_timeout()
         booting = uptime < self.boot_timeout
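A side note on the new `INFO_FIELD_OBJECT_TYPE` field above: when a checker raises, `_do_check()` now files the failure under an 'Internal' category. A sketch of the resulting `stats` shape — key names are taken from the unit tests further down, the message text is abbreviated:

```python
# Illustrative only: the stats entry recorded when a checker raises an exception
stats = {
    'Internal': {
        'ServiceChecker': {
            'status': 'Not OK',   # HealthChecker.INFO_FIELD_OBJECT_STATUS
            'message': 'Failed to perform health check for ServiceChecker due to exception - ...',
            'type': 'Internal',   # HealthChecker.INFO_FIELD_OBJECT_TYPE
        }
    }
}
```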
diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py
index a98e2d33c3..171f7e2e8b 100644
--- a/src/system-health/health_checker/service_checker.py
+++ b/src/system-health/health_checker/service_checker.py
@@ -1,12 +1,31 @@
+import docker
+import os
+import pickle
+import re
+
+from swsscommon import swsscommon
+from sonic_py_common import multi_asic
+from sonic_py_common.logger import Logger
 from .health_checker import HealthChecker
 from . import utils
 
+SYSLOG_IDENTIFIER = 'service_checker'
+logger = Logger(log_identifier=SYSLOG_IDENTIFIER)
+
 
 class ServiceChecker(HealthChecker):
     """
     Checker that checks critical system service status via monit service.
     """
+    # Cache file to save container_critical_processes
+    CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache'
+
+    CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes'
+
+    # Command to get merged directory of a container
+    GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"'
+
     # Command to query the status of monit service.
     CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'
@@ -24,6 +43,160 @@ class ServiceChecker(HealthChecker):
 
     def __init__(self):
         HealthChecker.__init__(self)
+        self.container_critical_processes = {}
+        # Containers that have an invalid critical_processes file
+        self.bad_containers = set()
+
+        self.container_feature_dict = {}
+
+        self.need_save_cache = False
+
+        self.load_critical_process_cache()
+
+    def get_expected_running_containers(self, feature_table):
+        """Get a set of containers that are expected to be running on SONiC
+
+        Args:
+            feature_table (object): FEATURE table in CONFIG_DB
+
+        Returns:
+            expected_running_containers: A set of container names that are expected to be running
+            container_feature_dict: A dictionary {<container_name>:<feature_name>}
+        """
+        expected_running_containers = set()
+        container_feature_dict = {}
+        for feature_name, feature_entry in feature_table.items():
+            if feature_entry["state"] not in ["disabled", "always_disabled"]:
+                if multi_asic.is_multi_asic():
+                    if feature_entry["has_global_scope"] == "True":
+                        expected_running_containers.add(feature_name)
+                        container_feature_dict[feature_name] = feature_name
+                    if feature_entry["has_per_asic_scope"] == "True":
+                        num_asics = multi_asic.get_num_asics()
+                        for asic_id in range(num_asics):
+                            expected_running_containers.add(feature_name + str(asic_id))
+                            container_feature_dict[feature_name + str(asic_id)] = feature_name
+                else:
+                    expected_running_containers.add(feature_name)
+                    container_feature_dict[feature_name] = feature_name
+
+        return expected_running_containers, container_feature_dict
+
+    def get_current_running_containers(self):
+        """Get currently running containers; if a running container is not yet in
+        self.container_critical_processes, try to collect its critical process list
+
+        Returns:
+            running_containers: A set of running container names
+        """
+        DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock')
+        running_containers = set()
+        ctrs = DOCKER_CLIENT.containers
+        try:
+            lst = ctrs.list(filters={"status": "running"})
+
+            for ctr in lst:
+                running_containers.add(ctr.name)
+                if ctr.name not in self.container_critical_processes:
+                    self.fill_critical_process_by_container(ctr.name)
+        except docker.errors.APIError as err:
+            logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err))
+
+        return running_containers
+
+    def get_critical_process_list_from_file(self, container, critical_processes_file):
+        """Read the critical process name list from a critical_processes file
+
+        Args:
+            container (str): container name
+            critical_processes_file (str): critical processes file path
+
+        Returns:
+            critical_process_list: A list of critical process names
+        """
+        critical_process_list = []
+
+        with open(critical_processes_file, 'r') as file:
+            for line in file:
+                # Try to match a line like "program:<process_name>"
+                match = re.match(r"^\s*((.+):(.*))*\s*$", line)
+                if match is None:
+                    if container not in self.bad_containers:
+                        self.bad_containers.add(container)
+                        logger.log_error('Invalid syntax in critical_processes file of {}'.format(container))
+                    continue
+
+                # group(2)/group(3) are None for blank lines, so check them before stripping
+                identifier_key = match.group(2)
+                identifier_value = match.group(3)
+                if identifier_key == "program" and identifier_value:
+                    critical_process_list.append(identifier_value.strip())
+
+        return critical_process_list
+
+    def fill_critical_process_by_container(self, container):
+        """Get the critical process list for a given container
+
+        Args:
+            container (str): container name
+        """
+        # Get container volume folder
+        container_folder = self._get_container_folder(container)
+        if not container_folder:
+            logger.log_error('Failed to get container folder for {}'.format(container))
+            return
+
+        if not os.path.exists(container_folder):
+            logger.log_error('Container folder does not exist: {}'.format(container_folder))
+            return
+
+        # Get critical_processes file path
+        critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH)
+        if not os.path.isfile(critical_processes_file):
+            # Critical process file does not exist, the container has no critical processes.
+            logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file))
+            self._update_container_critical_processes(container, [])
+            return
+
+        # Get critical process list from critical_processes
+        critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file)
+        self._update_container_critical_processes(container, critical_process_list)
+
+    def _update_container_critical_processes(self, container, critical_process_list):
+        self.container_critical_processes[container] = critical_process_list
+        self.need_save_cache = True
+
+    def _get_container_folder(self, container):
+        container_folder = utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container))
+        if container_folder is None:
+            return container_folder
+
+        return container_folder.strip()
+
+    def save_critical_process_cache(self):
+        """Save self.container_critical_processes to a cache file
+        """
+        if not self.need_save_cache:
+            return
+
+        self.need_save_cache = False
+        if not self.container_critical_processes:
+            # if container_critical_processes is empty, don't save it
+            return
+
+        if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE):
+            # if cache file exists, remove it
+            os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE)
+
+        with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f:
+            pickle.dump(self.container_critical_processes, f)
+
+    def load_critical_process_cache(self):
+        if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE):
+            # cache file does not exist
+            return
+
+        with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f:
+            self.container_critical_processes = pickle.load(f)
 
     def reset(self):
         self._info = {}
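Between the two hunks, a quick illustration (not part of the patch) of how the `critical_processes` regex above classifies typical input lines:

```python
import re

# Same pattern as in get_critical_process_list_from_file()
pattern = r"^\s*((.+):(.*))*\s*$"

for line in ['program:snmpd', 'program:snmp-subagent', 'not a valid line']:
    match = re.match(pattern, line)
    if match is None:
        print('invalid syntax:', line)              # the container gets flagged as bad
    elif match.group(2) == 'program':
        print('critical process:', match.group(3))  # e.g. 'snmpd'
```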
+ """ + config_db = swsscommon.ConfigDBConnector() + config_db.connect() + feature_table = config_db.get_table("FEATURE") + expected_running_containers, self.container_feature_dict = self.get_expected_running_containers(feature_table) + current_running_containers = self.get_current_running_containers() + + newly_disabled_containers = set(self.container_critical_processes.keys()).difference(expected_running_containers) + for newly_disabled_container in newly_disabled_containers: + self.container_critical_processes.pop(newly_disabled_container) + + self.save_critical_process_cache() + + not_running_containers = expected_running_containers.difference(current_running_containers) + for container in not_running_containers: + self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container)) + + if not self.container_critical_processes: + # Critical process is empty, not expect + self.set_object_not_ok('Service', 'system', 'no critical process found') + return + + for container, critical_process_list in self.container_critical_processes.items(): + self.check_process_existence(container, critical_process_list, config, feature_table) + + for bad_container in self.bad_containers: + self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect') + + def check(self, config): + """Check critical system service status. + + Args: + config (object): Health checker configuration. + """ + self.reset() + self.check_by_monit(config) + self.check_services(config) + + + def _parse_supervisorctl_status(self, process_status): + """Expected input: + arp_update RUNNING pid 67, uptime 1:03:56 + buffermgrd RUNNING pid 81, uptime 1:03:56 + + Args: + process_status (list): List of process status + """ + data = {} + for line in process_status: + line = line.strip() + if not line: + continue + items = line.split() + if len(items) < 2: + continue + data[items[0].strip()] = items[1].strip() + return data + + def check_process_existence(self, container_name, critical_process_list, config, feature_table): + """Check whether the process in the specified container is running or not. + + Args: + container_name (str): Container name + critical_process_list (list): Critical processes + config (object): Health checker configuration. + feature_table (object): Feature table + """ + feature_name = self.container_feature_dict[container_name] + if feature_name in feature_table: + # We look into the 'FEATURE' table to verify whether the container is disabled or not. + # If the container is diabled, we exit. + if ("state" in feature_table[feature_name] + and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]): + + # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because + # it not always possible to get process cmdline in supervisor.conf. 
diff --git a/src/system-health/setup.py b/src/system-health/setup.py
index b0cc998e48..62b02252e8 100644
--- a/src/system-health/setup.py
+++ b/src/system-health/setup.py
@@ -3,6 +3,7 @@ from setuptools import setup
 dependencies = [
     'natsort',
     'sonic_py_common',
+    'docker'
 ]
 
 setup(
diff --git a/src/system-health/tests/etc/supervisor/critical_processes b/src/system-health/tests/etc/supervisor/critical_processes
new file mode 100644
index 0000000000..cff479d7fa
--- /dev/null
+++ b/src/system-health/tests/etc/supervisor/critical_processes
@@ -0,0 +1,2 @@
+program:snmpd
+program:snmp-subagent
diff --git a/src/system-health/tests/system_health_monitoring_config.json b/src/system-health/tests/system_health_monitoring_config.json
new file mode 100644
index 0000000000..7284be7d16
--- /dev/null
+++ b/src/system-health/tests/system_health_monitoring_config.json
@@ -0,0 +1,11 @@
+{
+    "services_to_ignore": ["dummy_service"],
+    "devices_to_ignore": ["psu.voltage"],
+    "user_defined_checkers": [],
+    "polling_interval": 60,
+    "led_color": {
+        "fault": "orange",
+        "normal": "green",
+        "booting": "orange_blink"
+    }
+}
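How the fields of this test config map onto `Config` attributes — a sketch inferred from `test_config` in the test file below; the file path handling is simplified here:

```python
# Sketch, assuming a Config instance pointed at the JSON file above
config = Config()
config._config_file = 'system_health_monitoring_config.json'  # path simplified
config.load_config()

assert config.interval == 60                        # polling_interval
assert 'dummy_service' in config.ignore_services    # services_to_ignore
assert 'psu.voltage' in config.ignore_devices       # devices_to_ignore
assert len(config.user_defined_checkers) == 0       # user_defined_checkers
assert config.get_led_color('fault') == 'orange'    # led_color.fault
```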
diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py
index 8e3d3c77e9..c7ff86c28b 100644
--- a/src/system-health/tests/test_system_health.py
+++ b/src/system-health/tests/test_system_health.py
@@ -8,6 +8,7 @@
     2. HealthCheckerManager
     3. Config
 """
+import copy
 import os
 import sys
 from swsscommon import swsscommon
@@ -30,11 +31,21 @@ from health_checker.manager import HealthCheckerManager
 from health_checker.service_checker import ServiceChecker
 from health_checker.user_defined_checker import UserDefinedChecker
 
+mock_supervisorctl_output = """
+snmpd                            RUNNING   pid 67, uptime 1:03:56
+snmp-subagent                    EXITED    Oct 19 01:53 AM
+"""
 device_info.get_platform = MagicMock(return_value='unittest')
 
-def test_user_defined_checker():
-    utils.run_command = MagicMock(return_value='')
+def setup():
+    if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE):
+        os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE)
+
+
+@patch('health_checker.utils.run_command')
+def test_user_defined_checker(mock_run):
+    mock_run.return_value = ''
     checker = UserDefinedChecker('')
     checker.check(None)
 
@@ -43,29 +54,195 @@ def test_user_defined_checker():
     checker.reset()
     assert len(checker._info) == 0
 
-    utils.run_command = MagicMock(return_value='\n\n\n')
+    mock_run.return_value = '\n\n\n'
     checker.check(None)
     assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
 
     valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n'
-    utils.run_command = MagicMock(return_value=valid_output)
+    mock_run.return_value = valid_output
     checker.check(None)
+    assert checker.get_category() == 'MyCategory'
     assert 'Device1' in checker._info
     assert 'Device2' in checker._info
     assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
     assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
 
-def test_service_checker():
-    return_value = ''
+@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
+@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
+@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
+@patch('docker.DockerClient')
+@patch('health_checker.utils.run_command')
+@patch('swsscommon.swsscommon.ConfigDBConnector')
+def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_client):
+    mock_db_data = MagicMock()
+    mock_get_table = MagicMock()
+    mock_db_data.get_table = mock_get_table
+    mock_config_db.return_value = mock_db_data
+    mock_get_table.return_value = {
+        'snmp': {
+            'state': 'enabled',
+            'has_global_scope': 'True',
+            'has_per_asic_scope': 'False',
 
-    def mock_run_command(cmd):
-        if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD:
-            return 'active'
-        else:
-            return return_value
+        }
+    }
+    mock_containers = MagicMock()
+    mock_snmp_container = MagicMock()
+    mock_snmp_container.name = 'snmp'
+    mock_containers.list = MagicMock(return_value=[mock_snmp_container])
+    mock_docker_client_object = MagicMock()
+    mock_docker_client.return_value = mock_docker_client_object
+    mock_docker_client_object.containers = mock_containers
 
-    utils.run_command = mock_run_command
+    mock_run.return_value = mock_supervisorctl_output
+
+    checker = ServiceChecker()
+    assert checker.get_category() == 'Services'
+    config = Config()
+    checker.check(config)
+    assert 'snmp:snmpd' in checker._info
+    assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
+
+    assert 'snmp:snmp-subagent' in checker._info
+    assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+    mock_get_table.return_value = {
+        'new_service': {
+            'state': 'enabled',
+            'has_global_scope': 'True',
+            'has_per_asic_scope': 'False',
+        },
+        'snmp': {
+            'state': 'enabled',
+            'has_global_scope': 'True',
+            'has_per_asic_scope': 'False',
+
+        }
+    }
+    mock_ns_container = MagicMock()
+    mock_ns_container.name = 'new_service'
+    mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container])
+    checker.check(config)
+    assert 'new_service' in checker.container_critical_processes
+
+    assert 'new_service:snmpd' in checker._info
+    assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
+
+    assert 'new_service:snmp-subagent' in checker._info
+    assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+    mock_containers.list = MagicMock(return_value=[mock_snmp_container])
+    checker.check(config)
+    assert 'new_service' in checker._info
+    assert checker._info['new_service'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+    mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container])
+    mock_run.return_value = None
+    checker.check(config)
+    assert 'new_service:snmpd' in checker._info
+    assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+    assert 'new_service:snmp-subagent' in checker._info
+    assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+    origin_container_critical_processes = copy.deepcopy(checker.container_critical_processes)
+    checker.save_critical_process_cache()
+    checker.load_critical_process_cache()
+    assert origin_container_critical_processes == checker.container_critical_processes
+
+
+
+@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
+@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
+@patch('health_checker.utils.run_command', MagicMock(return_value=mock_supervisorctl_output))
+@patch('sonic_py_common.multi_asic.get_num_asics', MagicMock(return_value=3))
+@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=True))
+@patch('sonic_py_common.multi_asic.get_namespace_list', MagicMock(return_value=[str(x) for x in range(3)]))
+@patch('sonic_py_common.multi_asic.get_current_namespace', MagicMock(return_value=''))
+@patch('docker.DockerClient')
+@patch('swsscommon.swsscommon.ConfigDBConnector')
+def test_service_checker_multi_asic(mock_config_db, mock_docker_client):
+    mock_db_data = MagicMock()
+    mock_db_data.get_table = MagicMock()
+    mock_config_db.return_value = mock_db_data
+
+    mock_db_data.get_table.return_value = {
+        'snmp': {
+            'state': 'enabled',
+            'has_global_scope': 'True',
+            'has_per_asic_scope': 'True',
+
+        }
+    }
+
+    mock_containers = MagicMock()
+    mock_snmp_container = MagicMock()
+    mock_snmp_container.name = 'snmp'
+    list_return_value = [mock_snmp_container]
+    for i in range(3):
+        mock_container = MagicMock()
+        mock_container.name = 'snmp' + str(i)
+        list_return_value.append(mock_container)
+
+    mock_containers.list = MagicMock(return_value=list_return_value)
+    mock_docker_client_object = MagicMock()
+    mock_docker_client.return_value = mock_docker_client_object
+    mock_docker_client_object.containers = mock_containers
+
+    checker = ServiceChecker()
+
+    config = Config()
+    checker.check(config)
+    assert 'snmp' in checker.container_critical_processes
+    assert 'snmp:snmpd' in checker._info
+    assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
+    assert 'snmp0:snmpd' in checker._info
+    assert checker._info['snmp0:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
+    assert 'snmp1:snmpd' in checker._info
+    assert checker._info['snmp1:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
+    assert 'snmp2:snmpd' in checker._info
+    assert checker._info['snmp2:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
+
+    assert 'snmp:snmp-subagent' in checker._info
+    assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+    assert 'snmp0:snmp-subagent' in checker._info
+    assert checker._info['snmp0:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+    assert 'snmp1:snmp-subagent' in checker._info
+    assert checker._info['snmp1:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+    assert 'snmp2:snmp-subagent' in checker._info
+    assert checker._info['snmp2:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+
+@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock())
+@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
+@patch('health_checker.service_checker.ServiceChecker.check_by_monit', MagicMock())
+@patch('docker.DockerClient')
+@patch('swsscommon.swsscommon.ConfigDBConnector.get_table')
+def test_service_checker_no_critical_process(mock_get_table, mock_docker_client):
+    mock_get_table.return_value = {
+        'snmp': {
+            'state': 'enabled',
+            'has_global_scope': 'True',
+            'has_per_asic_scope': 'True',
+
+        }
+    }
+    mock_containers = MagicMock()
+    mock_containers.list = MagicMock(return_value=[])
+    mock_docker_client_object = MagicMock()
+    mock_docker_client.return_value = mock_docker_client_object
+    mock_docker_client_object.containers = mock_containers
+
+    checker = ServiceChecker()
+    config = Config()
+    checker.check(config)
+    assert 'system' in checker._info
+    assert checker._info['system'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+@patch('health_checker.service_checker.ServiceChecker.check_services', MagicMock())
+@patch('health_checker.utils.run_command')
+def test_service_checker_check_by_monit(mock_run):
     return_value = 'Monit 5.20.0 uptime: 3h 54m\n' \
                    'Service Name                     Status                      Type\n' \
                    'sonic                            Running                     System\n' \
@@ -74,7 +251,7 @@
                    'orchagent                        Running                     Process\n' \
                    'root-overlay                     Accessible                  Filesystem\n' \
                    'var-log                          Is not accessible           Filesystem\n'
-
+    mock_run.side_effect = ['active', return_value]
     checker = ServiceChecker()
     config = Config()
     checker.check(config)
@@ -185,6 +362,7 @@
     })
 
     checker = HardwareChecker()
+    assert checker.get_category() == 'Hardware'
     config = Config()
     checker.check(config)
 
@@ -217,3 +395,129 @@
 
     assert 'PSU 5' in checker._info
     assert checker._info['PSU 5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
+
+
+def test_config():
+    config = Config()
+    config._config_file = os.path.join(test_path, Config.CONFIG_FILE)
+
+    assert config.config_file_exists()
+    config.load_config()
+    assert config.interval == 60
+    assert 'dummy_service' in config.ignore_services
+    assert 'psu.voltage' in config.ignore_devices
+    assert len(config.user_defined_checkers) == 0
+
+    assert config.get_led_color('fault') == 'orange'
+    assert config.get_led_color('normal') == 'green'
+    assert config.get_led_color('booting') == 'orange_blink'
+    assert config.get_bootup_timeout() == 300
+
+    config._reset()
+    assert not config.ignore_services
+    assert not config.ignore_devices
+    assert not config.user_defined_checkers
+    assert not config.config_data
+
+    assert config.get_led_color('fault') == 'red'
+    assert config.get_led_color('normal') == 'green'
+    assert config.get_led_color('booting') == 'orange_blink'
+
+    config._last_mtime = 1
+    config._config_file = 'notExistFile'
+    config.load_config()
+    assert not config._last_mtime
+
+
+@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock())
+@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
+@patch('health_checker.service_checker.ServiceChecker.check', MagicMock())
+@patch('health_checker.hardware_checker.HardwareChecker.check', MagicMock())
+@patch('health_checker.user_defined_checker.UserDefinedChecker.check', MagicMock())
+@patch('swsscommon.swsscommon.ConfigDBConnector.get_table', MagicMock())
+@patch('health_checker.user_defined_checker.UserDefinedChecker.get_category', MagicMock(return_value='UserDefine'))
+@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
+@patch('health_checker.service_checker.ServiceChecker.get_info')
+@patch('health_checker.hardware_checker.HardwareChecker.get_info')
+@patch('health_checker.utils.get_uptime')
+def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
+    chassis = MagicMock()
+    chassis.set_status_led = MagicMock()
+
+    manager = HealthCheckerManager()
+    manager.config.user_defined_checkers = ['some check']
+    assert manager._state == HealthCheckerManager.STATE_BOOTING
+    assert len(manager._checkers) == 2
+
+    mock_uptime.return_value = 200
+    assert manager._is_system_booting()
+    state, stat = manager.check(chassis)
+    assert state == HealthCheckerManager.STATE_BOOTING
+    assert len(stat) == 0
+    chassis.set_status_led.assert_called_with('orange_blink')
+
+    mock_uptime.return_value = 500
+    assert not manager._is_system_booting()
+    assert manager._state == HealthCheckerManager.STATE_RUNNING
+    mock_hw_info.return_value = {
+        'ASIC': {
+            'type': 'ASIC',
+            'message': '',
+            'status': 'OK'
+        },
+        'fan1': {
+            'type': 'Fan',
+            'message': '',
+            'status': 'OK'
+        },
+    }
+    mock_service_info.return_value = {
+        'snmp:snmpd': {
+            'type': 'Process',
+            'message': '',
+            'status': 'OK'
+        }
+    }
+    mock_udc_info.return_value = {
+        'udc': {
+            'type': 'Database',
+            'message': '',
+            'status': 'OK'
+        }
+    }
+    state, stat = manager.check(chassis)
+    assert state == HealthCheckerManager.STATE_RUNNING
+    assert 'Services' in stat
+    assert stat['Services']['snmp:snmpd']['status'] == 'OK'
+
+    assert 'Hardware' in stat
+    assert stat['Hardware']['ASIC']['status'] == 'OK'
+    assert stat['Hardware']['fan1']['status'] == 'OK'
+
+    assert 'UserDefine' in stat
+    assert stat['UserDefine']['udc']['status'] == 'OK'
+
+    mock_hw_info.side_effect = RuntimeError()
+    mock_service_info.side_effect = RuntimeError()
+    mock_udc_info.side_effect = RuntimeError()
+    state, stat = manager.check(chassis)
+    assert 'Internal' in stat
+    assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
+    assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
+    assert stat['Internal']['UserDefinedChecker - some check']['status'] == 'Not OK'
+
+    chassis.set_status_led.side_effect = NotImplementedError()
+    manager._set_system_led(chassis, manager.config, 'normal')
+
+    chassis.set_status_led.side_effect = RuntimeError()
+    manager._set_system_led(chassis, manager.config, 'normal')
+
+def test_utils():
+    output = utils.run_command('some invalid command')
+    assert not output
+
+    output = utils.run_command('ls')
+    assert output
+
+    uptime = utils.get_uptime()
+    assert uptime > 0