[system-health] No longer check critical process/service status via monit (#9068)

HLD updated here: https://github.com/Azure/SONiC/pull/887

#### Why I did it

The command `monit summary -B` can no longer display the status of each critical process, so system-health should not depend on it and needs another way to monitor the status of critical processes. This PR addresses that. monit is still used by system-health for the file system check as well as for customized checks.

#### How I did it

1.	Get container names from FEATURE table
2.	For each container, collect critical process names from file critical_processes
3.	Use `docker exec -it <container_name> bash -c 'supervisorctl status'` to get the status of the processes inside the container, parse the output, and check whether any critical process has exited

#### How to verify it

1. Add unit test case to cover it
2. Adjust sonic-mgmt cases to cover it
3. Manual test
This commit is contained in:
Junchao-Mellanox 2021-11-24 07:47:48 +08:00 committed by GitHub
parent 240596ec7d
commit 11a93d2f92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 624 additions and 29 deletions

View File

@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
$(SYSTEM_HEALTH)_PYTHON_VERSION = 3
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3)
$(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON)
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"

View File

@ -1,3 +1,11 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
from .hardware_checker import HardwareChecker
from .user_defined_checker import UserDefinedChecker
class HealthCheckerManager(object):
"""
Manage all system health checkers and system health configuration.
@ -10,7 +18,6 @@ class HealthCheckerManager(object):
self._checkers = []
self._state = self.STATE_BOOTING
from .config import Config
self.config = Config()
self.initialize()
@ -19,8 +26,6 @@ class HealthCheckerManager(object):
Initialize the manager. Create service checker and hardware checker by default.
:return:
"""
from .service_checker import ServiceChecker
from .hardware_checker import HardwareChecker
self._checkers.append(ServiceChecker())
self._checkers.append(HardwareChecker())
@ -31,7 +36,6 @@ class HealthCheckerManager(object):
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
contains the status for all objects that was checked.
"""
from .health_checker import HealthChecker
HealthChecker.summary = HealthChecker.STATUS_OK
stats = {}
self.config.load_config()
@ -45,7 +49,6 @@ class HealthCheckerManager(object):
self._do_check(checker, stats)
if self.config.user_defined_checkers:
from .user_defined_checker import UserDefinedChecker
for udc in self.config.user_defined_checkers:
checker = UserDefinedChecker(udc)
self._do_check(checker, stats)
@ -71,11 +74,12 @@ class HealthCheckerManager(object):
else:
stats[category].update(info)
except Exception as e:
from .health_checker import HealthChecker
HealthChecker.summary = HealthChecker.STATUS_NOT_OK
error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
entry = {str(checker): {
HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg,
HealthChecker.INFO_FIELD_OBJECT_TYPE: "Internal"
}}
if 'Internal' not in stats:
stats['Internal'] = entry
@ -83,8 +87,7 @@ class HealthCheckerManager(object):
stats['Internal'].update(entry)
def _is_system_booting(self):
from .utils import get_uptime
uptime = get_uptime()
uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout

View File

@ -1,12 +1,31 @@
import docker
import os
import pickle
import re
from swsscommon import swsscommon
from sonic_py_common import multi_asic
from sonic_py_common.logger import Logger
from .health_checker import HealthChecker
from . import utils
SYSLOG_IDENTIFIER = 'service_checker'
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)
class ServiceChecker(HealthChecker):
"""
Checker that checks critical system service status via monit service.
"""
# Cache file to save container_critical_processes
CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache'
CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes'
# Command to get merged directory of a container
GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"'
# Command to query the status of monit service.
CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'
@ -24,6 +43,160 @@ class ServiceChecker(HealthChecker):
def __init__(self):
    """Initialise empty bookkeeping state, then restore any cached critical process map."""
    HealthChecker.__init__(self)

    # Nothing has changed yet, so there is nothing to write back to the cache file.
    self.need_save_cache = False

    # {<container_name>: <feature_name>}; replaced on every check_services() call.
    self.container_feature_dict = {}

    # Containers whose critical_processes file could not be parsed.
    self.bad_containers = set()

    # {<container_name>: [<critical_process_name>, ...]}
    self.container_critical_processes = {}

    # Must run last: it may overwrite container_critical_processes with cached data.
    self.load_critical_process_cache()
def get_expected_running_containers(self, feature_table):
    """Derive the containers that should be running from the FEATURE table.

    Features whose state is "disabled" or "always_disabled" are skipped. On a
    single-ASIC system each remaining feature maps to one container of the same
    name. On a multi-ASIC system a feature contributes a container named after
    the feature when it has global scope, and one container per ASIC
    (<feature_name><asic_id>) when it has per-ASIC scope.

    Args:
        feature_table (object): FEATURE table in CONFIG_DB

    Returns:
        expected_running_containers: A set of container names that are expected to be running
        container_feature_dict: A dictionary {<container_name>:<feature_name>}
    """
    expected = set()
    feature_by_container = {}

    def _expect(container_name, owner_feature):
        # Keep the set and the reverse lookup in sync.
        expected.add(container_name)
        feature_by_container[container_name] = owner_feature

    for feature_name, feature_entry in feature_table.items():
        if feature_entry["state"] in ["disabled", "always_disabled"]:
            continue

        if not multi_asic.is_multi_asic():
            _expect(feature_name, feature_name)
            continue

        if feature_entry["has_global_scope"] == "True":
            _expect(feature_name, feature_name)
        if feature_entry["has_per_asic_scope"] == "True":
            for asic_id in range(multi_asic.get_num_asics()):
                _expect(feature_name + str(asic_id), feature_name)

    return expected, feature_by_container
def get_current_running_containers(self):
    """Get the names of the containers that are currently running.

    For every running container that is not yet present in
    self.container_critical_processes, the critical process list is looked up
    and recorded as a side effect.

    A failure of the docker API is logged and results in a (possibly partial)
    set being returned instead of an exception.

    Returns:
        running_containers: A set of running container names
    """
    running_containers = set()
    client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    try:
        for ctr in client.containers.list(filters={"status": "running"}):
            running_containers.add(ctr.name)
            if ctr.name not in self.container_critical_processes:
                self.fill_critical_process_by_container(ctr.name)
    except docker.errors.APIError as err:
        logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err))
    finally:
        # A new client is created on every call; release its connections so
        # they do not accumulate over the lifetime of the daemon.
        client.close()

    return running_containers
def get_critical_process_list_from_file(self, container, critical_processes_file):
    """Read critical process name list from critical processes file

    Only lines of the form "program:<process_name>" contribute to the result.
    Blank lines are skipped. Lines with any other "<key>:<value>" are accepted
    but ignored. A non-blank line without that shape marks the container as
    bad (once) and is otherwise skipped.

    Args:
        container (str): container name
        critical_processes_file (str): critical processes file path

    Returns:
        critical_process_list: A list of critical process names
    """
    critical_process_list = []

    with open(critical_processes_file, 'r') as file:
        for line in file:
            if not line.strip():
                # The pattern below also matches blank lines, but with empty
                # groups; calling .strip() on them would raise AttributeError.
                continue

            # Try to match a line like "program:<process_name>"
            match = re.match(r"^\s*((.+):(.*))*\s*$", line)
            if match is None:
                if container not in self.bad_containers:
                    self.bad_containers.add(container)
                    logger.log_error('Invalid syntax in critical_processes file of {}'.format(container))
                continue

            identifier_key = match.group(2).strip()
            identifier_value = match.group(3).strip()
            if identifier_key == "program" and identifier_value:
                critical_process_list.append(identifier_value)

    return critical_process_list
def fill_critical_process_by_container(self, container):
    """Get critical process for a given container

    Locates the container's folder on the host, reads the critical_processes
    file beneath it and records the resulting list for the container. A
    container without such a file is recorded with an empty list. If the
    container folder cannot be determined or does not exist, an error is
    logged and nothing is recorded.

    Args:
        container (str): container name
    """
    # Get container volume folder
    container_folder = self._get_container_folder(container)
    if not container_folder:
        # container_folder is empty/None here, so report the container name instead
        logger.log_error('Failed to get container folder for {}'.format(container))
        return

    if not os.path.exists(container_folder):
        logger.log_error('Container folder does not exist: {}'.format(container_folder))
        return

    # Get critical_processes file path
    critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH)
    if not os.path.isfile(critical_processes_file):
        # Critical process file does not exist, the container has no critical processes.
        logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file))
        self._update_container_critical_processes(container, [])
        return

    # Get critical process list from critical_processes
    critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file)
    self._update_container_critical_processes(container, critical_process_list)
def _update_container_critical_processes(self, container, critical_process_list):
    """Record the critical process list of a container and mark the cache as dirty.

    Args:
        container (str): container name
        critical_process_list (list): critical process names of that container
    """
    self.container_critical_processes.update({container: critical_process_list})
    # Tell save_critical_process_cache() that there is something new to persist.
    self.need_save_cache = True
def _get_container_folder(self, container):
    """Query docker for the merged directory of a container.

    Args:
        container (str): container name

    Returns:
        The command output with surrounding whitespace removed, or None when
        the command helper returned None.
    """
    inspect_cmd = ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container)
    output = utils.run_command(inspect_cmd)
    return output if output is None else output.strip()
def save_critical_process_cache(self):
    """Write self.container_critical_processes to the cache file when it has changed.

    Does nothing unless need_save_cache is set. The flag is cleared before
    writing, and an empty map is never written.
    """
    if not self.need_save_cache:
        return
    self.need_save_cache = False

    processes = self.container_critical_processes
    if not processes:
        # Nothing worth caching.
        return

    cache_path = ServiceChecker.CRITICAL_PROCESS_CACHE
    if os.path.exists(cache_path):
        # Drop the previous cache file before writing a fresh one.
        os.remove(cache_path)

    with open(cache_path, 'wb+') as cache:
        pickle.dump(processes, cache)
def load_critical_process_cache(self):
    """Restore self.container_critical_processes from the cache file, if usable.

    A missing cache file is silently ignored. A cache file that cannot be read
    or unpickled, or that does not contain a dictionary, is logged and ignored
    so that a damaged cache cannot prevent the checker from being constructed;
    the map is then rebuilt from the running containers during checks.
    """
    cache_path = ServiceChecker.CRITICAL_PROCESS_CACHE
    if not os.path.isfile(cache_path):
        # cache file does not exist
        return

    # NOTE(security): the cache lives under /tmp and is read with pickle, which
    # can execute arbitrary code from a crafted file. Consider moving the cache
    # to a directory only this service can write, or switching to JSON.
    try:
        with open(cache_path, 'rb') as f:
            cached = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError, IndexError, ValueError) as err:
        logger.log_error('Failed to load critical process cache {}: {}'.format(cache_path, repr(err)))
        return

    if not isinstance(cached, dict):
        logger.log_error('Ignoring critical process cache {}: unexpected content'.format(cache_path))
        return

    self.container_critical_processes = cached
def reset(self):
self._info = {}
@ -31,16 +204,14 @@ class ServiceChecker(HealthChecker):
def get_category(self):
    """Return the category label of this checker, 'Services'."""
    return 'Services'
def check(self, config):
def check_by_monit(self, config):
"""
Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
process and file system.
et and analyze the output of $CHECK_CMD, collect status for file system or customize checker if any.
:param config: Health checker configuration.
:return:
"""
self.reset()
output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip()
if output != 'active':
output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
if not output or output.strip() != 'active':
self.set_object_not_ok('Service', 'monit', 'monit service is not running')
return
@ -58,7 +229,7 @@ class ServiceChecker(HealthChecker):
for line in lines[2:]:
name = line[0:status_begin].strip()
if config.ignore_services and name in config.ignore_services:
if config and config.ignore_services and name in config.ignore_services:
continue
status = line[status_begin:type_begin].strip()
service_type = line[type_begin:].strip()
@ -70,3 +241,105 @@ class ServiceChecker(HealthChecker):
else:
self.set_object_ok(service_type, name)
return
def check_services(self, config):
    """Check status of critical services and critical processes

    Reads the FEATURE table from CONFIG_DB, compares the containers expected
    to run with the ones actually running, and then checks the critical
    processes of every tracked container.

    Args:
        config (config.Config): Health checker configuration.
    """
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    expected, self.container_feature_dict = self.get_expected_running_containers(feature_table)
    running = self.get_current_running_containers()

    # Forget the critical processes of containers that are no longer expected to run.
    stale_containers = set(self.container_critical_processes.keys()) - expected
    for stale_container in stale_containers:
        self.container_critical_processes.pop(stale_container)

    self.save_critical_process_cache()

    # Every expected container that is not running is reported as a failed service.
    for container in expected - running:
        self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container))

    if not self.container_critical_processes:
        # Critical process is empty, not expect
        self.set_object_not_ok('Service', 'system', 'no critical process found')
        return

    for container, critical_process_list in self.container_critical_processes.items():
        self.check_process_existence(container, critical_process_list, config, feature_table)

    for bad_container in self.bad_containers:
        self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect')
def check(self, config):
    """Check critical system service status.

    Clears the results of the previous run, then performs the monit based
    check followed by the container / critical process check.

    Args:
        config (object): Health checker configuration.
    """
    self.reset()
    self.check_by_monit(config)
    self.check_services(config)
def _parse_supervisorctl_status(self, process_status):
    """Turn the lines of `supervisorctl status` output into a name -> state mapping.

    Expected input:
        arp_update                       RUNNING   pid 67, uptime 1:03:56
        buffermgrd                       RUNNING   pid 81, uptime 1:03:56

    Lines with fewer than two whitespace separated fields (including blank
    lines) are ignored.

    Args:
        process_status (list): List of process status

    Returns:
        A dictionary {<process_name>: <state>} built from the first two fields of each line
    """
    states = {}
    for raw_line in process_status:
        # split() without arguments already discards surrounding whitespace.
        fields = raw_line.split()
        if len(fields) >= 2:
            states[fields[0]] = fields[1]
    return states
def check_process_existence(self, container_name, critical_process_list, config, feature_table):
    """Check whether the process in the specified container is running or not.

    Nothing is checked when the owning feature is missing from the FEATURE
    table, has no "state" field, or is disabled. Processes listed in
    config.ignore_services are never reported, including when the process
    status of the container cannot be queried at all.

    Args:
        container_name (str): Container name
        critical_process_list (list): Critical processes
        config (object): Health checker configuration.
        feature_table (object): Feature table
    """
    feature_name = self.container_feature_dict[container_name]
    if feature_name not in feature_table:
        return

    # We look into the 'FEATURE' table to verify whether the container is disabled or not.
    # If the container is disabled, we exit.
    feature_entry = feature_table[feature_name]
    if "state" not in feature_entry or feature_entry["state"] in ["disabled", "always_disabled"]:
        return

    ignored_services = config.ignore_services if config and config.ignore_services else ()

    # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because
    # it not always possible to get process cmdline in supervisor.conf. E.g, cmdline of orchagent is "/usr/bin/orchagent",
    # however, in supervisor.conf it is "/usr/bin/orchagent.sh"
    cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name)
    process_status = utils.run_command(cmd)
    if process_status is None:
        # The status could not be queried: treat every critical process that is not ignored as down.
        for process_name in critical_process_list:
            if process_name in ignored_services:
                continue
            self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
        return

    process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())
    for process_name in critical_process_list:
        if process_name in ignored_services:
            continue

        # Sometimes process_name is in critical_processes file, but it is not in supervisor.conf, such process will not run in container.
        # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
        if process_name not in process_status:
            continue

        object_name = '{}:{}'.format(container_name, process_name)
        if process_status[process_name] != 'RUNNING':
            self.set_object_not_ok('Process', object_name, "'{}' is not running".format(process_name))
        else:
            self.set_object_ok('Process', object_name)

View File

@ -3,6 +3,7 @@ from setuptools import setup
dependencies = [
'natsort',
'sonic_py_common',
'docker'
]
setup(

View File

@ -0,0 +1,2 @@
program:snmpd
program:snmp-subagent

View File

@ -0,0 +1,11 @@
{
"services_to_ignore": ["dummy_service"],
"devices_to_ignore": ["psu.voltage"],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}

View File

@ -8,6 +8,7 @@
2. HealthCheckerManager
3. Config
"""
import copy
import os
import sys
from swsscommon import swsscommon
@ -30,11 +31,21 @@ from health_checker.manager import HealthCheckerManager
from health_checker.service_checker import ServiceChecker
from health_checker.user_defined_checker import UserDefinedChecker
mock_supervisorctl_output = """
snmpd RUNNING pid 67, uptime 1:03:56
snmp-subagent EXITED Oct 19 01:53 AM
"""
device_info.get_platform = MagicMock(return_value='unittest')
def test_user_defined_checker():
utils.run_command = MagicMock(return_value='')
def setup():
    """Remove a critical process cache file left behind by an earlier run, if any."""
    cache_path = ServiceChecker.CRITICAL_PROCESS_CACHE
    if not os.path.exists(cache_path):
        return
    os.remove(cache_path)
@patch('health_checker.utils.run_command')
def test_user_defined_checker(mock_run):
mock_run.return_value = ''
checker = UserDefinedChecker('')
checker.check(None)
@ -43,29 +54,195 @@ def test_user_defined_checker():
checker.reset()
assert len(checker._info) == 0
utils.run_command = MagicMock(return_value='\n\n\n')
mock_run.return_value = '\n\n\n'
checker.check(None)
assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n'
utils.run_command = MagicMock(return_value=valid_output)
mock_run.return_value = valid_output
checker.check(None)
assert checker.get_category() == 'MyCategory'
assert 'Device1' in checker._info
assert 'Device2' in checker._info
assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
def test_service_checker():
return_value = ''
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
@patch('docker.DockerClient')
@patch('health_checker.utils.run_command')
@patch('swsscommon.swsscommon.ConfigDBConnector')
def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_client):
mock_db_data = MagicMock()
mock_get_table = MagicMock()
mock_db_data.get_table = mock_get_table
mock_config_db.return_value = mock_db_data
mock_get_table.return_value = {
'snmp': {
'state': 'enabled',
'has_global_scope': 'True',
'has_per_asic_scope': 'False',
def mock_run_command(cmd):
if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD:
return 'active'
else:
return return_value
}
}
mock_containers = MagicMock()
mock_snmp_container = MagicMock()
mock_snmp_container.name = 'snmp'
mock_containers.list = MagicMock(return_value=[mock_snmp_container])
mock_docker_client_object = MagicMock()
mock_docker_client.return_value = mock_docker_client_object
mock_docker_client_object.containers = mock_containers
utils.run_command = mock_run_command
mock_run.return_value = mock_supervisorctl_output
checker = ServiceChecker()
assert checker.get_category() == 'Services'
config = Config()
checker.check(config)
assert 'snmp:snmpd' in checker._info
assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
assert 'snmp:snmp-subagent' in checker._info
assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
mock_get_table.return_value = {
'new_service': {
'state': 'enabled',
'has_global_scope': 'True',
'has_per_asic_scope': 'False',
},
'snmp': {
'state': 'enabled',
'has_global_scope': 'True',
'has_per_asic_scope': 'False',
}
}
mock_ns_container = MagicMock()
mock_ns_container.name = 'new_service'
mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container])
checker.check(config)
assert 'new_service' in checker.container_critical_processes
assert 'new_service:snmpd' in checker._info
assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
assert 'new_service:snmp-subagent' in checker._info
assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
mock_containers.list = MagicMock(return_value=[mock_snmp_container])
checker.check(config)
assert 'new_service' in checker._info
assert checker._info['new_service'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container])
mock_run.return_value = None
checker.check(config)
assert 'new_service:snmpd' in checker._info
assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
assert 'new_service:snmp-subagent' in checker._info
assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
origin_container_critical_processes = copy.deepcopy(checker.container_critical_processes)
checker.save_critical_process_cache()
checker.load_critical_process_cache()
assert origin_container_critical_processes == checker.container_critical_processes
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
@patch('health_checker.utils.run_command', MagicMock(return_value=mock_supervisorctl_output))
@patch('sonic_py_common.multi_asic.get_num_asics', MagicMock(return_value=3))
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=True))
@patch('sonic_py_common.multi_asic.get_namespace_list', MagicMock(return_value=[str(x) for x in range(3)]))
@patch('sonic_py_common.multi_asic.get_current_namespace', MagicMock(return_value=''))
@patch('docker.DockerClient')
@patch('swsscommon.swsscommon.ConfigDBConnector')
def test_service_checker_multi_asic(mock_config_db, mock_docker_client):
    """On a 3-ASIC system, a feature with global and per-ASIC scope is checked in every container."""
    # FEATURE table: snmp enabled, both globally and per ASIC.
    mock_db_data = MagicMock()
    mock_db_data.get_table = MagicMock()
    mock_config_db.return_value = mock_db_data
    mock_db_data.get_table.return_value = {
        'snmp': {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'True',
        }
    }

    # Docker reports the global container plus one container per ASIC as running.
    container_names = ['snmp'] + ['snmp' + str(i) for i in range(3)]
    running_containers = []
    for container_name in container_names:
        running_container = MagicMock()
        running_container.name = container_name
        running_containers.append(running_container)

    mock_containers = MagicMock()
    mock_containers.list = MagicMock(return_value=running_containers)
    mock_docker_client_object = MagicMock()
    mock_docker_client.return_value = mock_docker_client_object
    mock_docker_client_object.containers = mock_containers

    checker = ServiceChecker()
    config = Config()
    checker.check(config)

    assert 'snmp' in checker.container_critical_processes

    # The mocked supervisorctl output has snmpd RUNNING and snmp-subagent EXITED in every container.
    expectations = (
        ('snmpd', HealthChecker.STATUS_OK),
        ('snmp-subagent', HealthChecker.STATUS_NOT_OK),
    )
    for process_name, expected_status in expectations:
        for container_name in container_names:
            key = '{}:{}'.format(container_name, process_name)
            assert key in checker._info
            assert checker._info[key][HealthChecker.INFO_FIELD_OBJECT_STATUS] == expected_status
@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock())
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker.check_by_monit', MagicMock())
@patch('docker.DockerClient')
@patch('swsscommon.swsscommon.ConfigDBConnector.get_table')
def test_service_checker_no_critical_process(mock_get_table, mock_docker_client):
    """With no running container and no critical process known, 'system' is reported as not OK."""
    snmp_feature = {
        'state': 'enabled',
        'has_global_scope': 'True',
        'has_per_asic_scope': 'True',
    }
    mock_get_table.return_value = {'snmp': snmp_feature}

    # Docker reports that nothing is running.
    mock_docker_client_object = MagicMock()
    mock_containers = MagicMock()
    mock_containers.list = MagicMock(return_value=[])
    mock_docker_client_object.containers = mock_containers
    mock_docker_client.return_value = mock_docker_client_object

    checker = ServiceChecker()
    checker.check(Config())

    assert 'system' in checker._info
    system_status = checker._info['system'][HealthChecker.INFO_FIELD_OBJECT_STATUS]
    assert system_status == HealthChecker.STATUS_NOT_OK
@patch('health_checker.service_checker.ServiceChecker.check_services', MagicMock())
@patch('health_checker.utils.run_command')
def test_service_checker_check_by_monit(mock_run):
return_value = 'Monit 5.20.0 uptime: 3h 54m\n' \
'Service Name Status Type\n' \
'sonic Running System\n' \
@ -74,7 +251,7 @@ def test_service_checker():
'orchagent Running Process\n' \
'root-overlay Accessible Filesystem\n' \
'var-log Is not accessible Filesystem\n'
mock_run.side_effect = ['active', return_value]
checker = ServiceChecker()
config = Config()
checker.check(config)
@ -185,6 +362,7 @@ def test_hardware_checker():
})
checker = HardwareChecker()
assert checker.get_category() == 'Hardware'
config = Config()
checker.check(config)
@ -217,3 +395,129 @@ def test_hardware_checker():
assert 'PSU 5' in checker._info
assert checker._info['PSU 5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
def test_config():
    """Exercise Config loading, resetting to defaults and handling of a missing config file."""
    config = Config()
    config._config_file = os.path.join(test_path, Config.CONFIG_FILE)

    # Values read from the test configuration file.
    assert config.config_file_exists()
    config.load_config()
    assert config.interval == 60
    assert 'dummy_service' in config.ignore_services
    assert 'psu.voltage' in config.ignore_devices
    assert len(config.user_defined_checkers) == 0
    loaded_colors = (('fault', 'orange'), ('normal', 'green'), ('booting', 'orange_blink'))
    for led_state, color in loaded_colors:
        assert config.get_led_color(led_state) == color
    assert config.get_bootup_timeout() == 300

    # After a reset everything loaded is gone and the default LED colors apply.
    config._reset()
    assert not config.ignore_services
    assert not config.ignore_devices
    assert not config.user_defined_checkers
    assert not config.config_data
    default_colors = (('fault', 'red'), ('normal', 'green'), ('booting', 'orange_blink'))
    for led_state, color in default_colors:
        assert config.get_led_color(led_state) == color

    # Loading a non-existent file clears the remembered modification time.
    config._last_mtime = 1
    config._config_file = 'notExistFile'
    config.load_config()
    assert not config._last_mtime
@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock())
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker.check', MagicMock())
@patch('health_checker.hardware_checker.HardwareChecker.check', MagicMock())
@patch('health_checker.user_defined_checker.UserDefinedChecker.check', MagicMock())
@patch('swsscommon.swsscommon.ConfigDBConnector.get_table', MagicMock())
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_category', MagicMock(return_value='UserDefine'))
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
@patch('health_checker.service_checker.ServiceChecker.get_info')
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
@patch('health_checker.utils.get_uptime')
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
    """Drive HealthCheckerManager through booting, a healthy run, failing checkers and LED errors."""

    def ok_entry(object_type):
        # Healthy status entry in the shape the checkers' get_info() is mocked to return.
        return {'type': object_type, 'message': '', 'status': 'OK'}

    chassis = MagicMock()
    chassis.set_status_led = MagicMock()

    manager = HealthCheckerManager()
    manager.config.user_defined_checkers = ['some check']
    assert manager._state == HealthCheckerManager.STATE_BOOTING
    assert len(manager._checkers) == 2

    # With an uptime of 200 the manager still reports booting and produces no stats.
    mock_uptime.return_value = 200
    assert manager._is_system_booting()
    state, stat = manager.check(chassis)
    assert state == HealthCheckerManager.STATE_BOOTING
    assert len(stat) == 0
    chassis.set_status_led.assert_called_with('orange_blink')

    # With an uptime of 500 the manager switches to the running state.
    mock_uptime.return_value = 500
    assert not manager._is_system_booting()
    assert manager._state == HealthCheckerManager.STATE_RUNNING

    mock_hw_info.return_value = {
        'ASIC': ok_entry('ASIC'),
        'fan1': ok_entry('Fan'),
    }
    mock_service_info.return_value = {'snmp:snmpd': ok_entry('Process')}
    mock_udc_info.return_value = {'udc': ok_entry('Database')}

    state, stat = manager.check(chassis)
    assert state == HealthCheckerManager.STATE_RUNNING
    assert 'Services' in stat
    assert stat['Services']['snmp:snmpd']['status'] == 'OK'
    assert 'Hardware' in stat
    assert stat['Hardware']['ASIC']['status'] == 'OK'
    assert stat['Hardware']['fan1']['status'] == 'OK'
    assert 'UserDefine' in stat
    assert stat['UserDefine']['udc']['status'] == 'OK'

    # Every checker raising is reported under the 'Internal' category.
    for failing_info in (mock_hw_info, mock_service_info, mock_udc_info):
        failing_info.side_effect = RuntimeError()
    state, stat = manager.check(chassis)
    assert 'Internal' in stat
    for checker_name in ('ServiceChecker', 'HardwareChecker', 'UserDefinedChecker - some check'):
        assert stat['Internal'][checker_name]['status'] == 'Not OK'

    # Errors raised by the platform LED API must not escape _set_system_led().
    for led_error in (NotImplementedError(), RuntimeError()):
        chassis.set_status_led.side_effect = led_error
        manager._set_system_led(chassis, manager.config, 'normal')
def test_utils():
    """Check run_command on a failing and a working command, and that uptime is positive."""
    failed_output = utils.run_command('some invalid command')
    assert not failed_output

    listing = utils.run_command('ls')
    assert listing

    assert utils.get_uptime() > 0