[system-health] No longer check critical process/service status via monit (#9068)
HLD updated here: https://github.com/Azure/SONiC/pull/887

#### Why I did it
Command `monit summary -B` can no longer display the status of each critical process, so system-health should not depend on it and needs another way to monitor the status of critical processes. This PR addresses that. monit is still used by system-health for the file system check as well as customized checks.

#### How I did it
1. Get container names from the FEATURE table.
2. For each container, collect critical process names from its critical_processes file.
3. Use `docker exec -it <container_name> bash -c 'supervisorctl status'` to get the status of the processes inside the container, parse the output, and check whether any critical process has exited (see the sketch below for the overall flow).

#### How to verify it
1. Add unit test cases to cover it.
2. Adjust sonic-mgmt cases to cover it.
3. Manual test.
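For illustration only, here is a minimal sketch of that flow, assuming it runs on a SONiC host where `swsscommon` and docker are available; the helper names (`get_enabled_containers`, `get_exited_critical_processes`) are hypothetical and the error handling is simplified compared to the actual `ServiceChecker` implementation in the diff below.

```python
# Illustrative sketch only; helper names are hypothetical and error handling is
# simplified compared to the real ServiceChecker implementation in this commit.
import subprocess

from swsscommon import swsscommon


def get_enabled_containers():
    """Read feature (container) names from the FEATURE table in CONFIG_DB."""
    config_db = swsscommon.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")
    return [name for name, entry in feature_table.items()
            if entry.get("state") not in ("disabled", "always_disabled")]


def get_exited_critical_processes(container, critical_processes):
    """Run 'supervisorctl status' inside a container and return the critical
    processes that are not in RUNNING state."""
    cmd = ['docker', 'exec', container, 'bash', '-c', 'supervisorctl status']
    output = subprocess.run(cmd, capture_output=True, text=True).stdout
    status = {}
    for line in output.splitlines():
        fields = line.split()
        if len(fields) >= 2:
            status[fields[0]] = fields[1]  # e.g. {'snmpd': 'RUNNING'}
    return [p for p in critical_processes if status.get(p) != 'RUNNING']
```

The real implementation additionally caches the per-container critical process lists and honors `services_to_ignore` from the health checker configuration.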
parent 240596ec7d
commit 11a93d2f92
@@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
$(SYSTEM_HEALTH)_PYTHON_VERSION = 3
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3)
$(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON)
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)

export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
@@ -1,3 +1,11 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
from .hardware_checker import HardwareChecker
from .user_defined_checker import UserDefinedChecker


class HealthCheckerManager(object):
    """
    Manage all system health checkers and system health configuration.
@@ -10,7 +18,6 @@ class HealthCheckerManager(object):
        self._checkers = []
        self._state = self.STATE_BOOTING

        from .config import Config
        self.config = Config()
        self.initialize()

@@ -19,8 +26,6 @@ class HealthCheckerManager(object):
        Initialize the manager. Create service checker and hardware checker by default.
        :return:
        """
        from .service_checker import ServiceChecker
        from .hardware_checker import HardwareChecker
        self._checkers.append(ServiceChecker())
        self._checkers.append(HardwareChecker())

@@ -31,7 +36,6 @@ class HealthCheckerManager(object):
        :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
        contains the status for all objects that was checked.
        """
        from .health_checker import HealthChecker
        HealthChecker.summary = HealthChecker.STATUS_OK
        stats = {}
        self.config.load_config()
@@ -45,7 +49,6 @@ class HealthCheckerManager(object):
            self._do_check(checker, stats)

        if self.config.user_defined_checkers:
            from .user_defined_checker import UserDefinedChecker
            for udc in self.config.user_defined_checkers:
                checker = UserDefinedChecker(udc)
                self._do_check(checker, stats)
@@ -71,11 +74,12 @@ class HealthCheckerManager(object):
            else:
                stats[category].update(info)
        except Exception as e:
            from .health_checker import HealthChecker
            HealthChecker.summary = HealthChecker.STATUS_NOT_OK
            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
            entry = {str(checker): {
                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg,
                HealthChecker.INFO_FIELD_OBJECT_TYPE: "Internal"
            }}
            if 'Internal' not in stats:
                stats['Internal'] = entry
@@ -83,8 +87,7 @@ class HealthCheckerManager(object):
                stats['Internal'].update(entry)

    def _is_system_booting(self):
        from .utils import get_uptime
        uptime = get_uptime()
        uptime = utils.get_uptime()
        if not self.boot_timeout:
            self.boot_timeout = self.config.get_bootup_timeout()
        booting = uptime < self.boot_timeout
@@ -1,12 +1,31 @@
import docker
import os
import pickle
import re

from swsscommon import swsscommon
from sonic_py_common import multi_asic
from sonic_py_common.logger import Logger
from .health_checker import HealthChecker
from . import utils

SYSLOG_IDENTIFIER = 'service_checker'
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)


class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Cache file to save container_critical_processes
    CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache'

    CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes'

    # Command to get merged directory of a container
    GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"'

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

@@ -24,6 +43,160 @@ class ServiceChecker(HealthChecker):

    def __init__(self):
        HealthChecker.__init__(self)
        self.container_critical_processes = {}
        # Containers that has invalid critical_processes file
        self.bad_containers = set()

        self.container_feature_dict = {}

        self.need_save_cache = False

        self.load_critical_process_cache()

    def get_expected_running_containers(self, feature_table):
        """Get a set of containers that are expected to running on SONiC

        Args:
            feature_table (object): FEATURE table in CONFIG_DB

        Returns:
            expected_running_containers: A set of container names that are expected running
            container_feature_dict: A dictionary {<container_name>:<feature_name>}
        """
        expected_running_containers = set()
        container_feature_dict = {}
        for feature_name, feature_entry in feature_table.items():
            if feature_entry["state"] not in ["disabled", "always_disabled"]:
                if multi_asic.is_multi_asic():
                    if feature_entry["has_global_scope"] == "True":
                        expected_running_containers.add(feature_name)
                        container_feature_dict[feature_name] = feature_name
                    if feature_entry["has_per_asic_scope"] == "True":
                        num_asics = multi_asic.get_num_asics()
                        for asic_id in range(num_asics):
                            expected_running_containers.add(feature_name + str(asic_id))
                            container_feature_dict[feature_name + str(asic_id)] = feature_name
                else:
                    expected_running_containers.add(feature_name)
                    container_feature_dict[feature_name] = feature_name

        return expected_running_containers, container_feature_dict

    def get_current_running_containers(self):
        """Get current running containers, if the running container is not in self.container_critical_processes,
        try get the critical process list

        Returns:
            running_containers: A set of running container names
        """
        DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock')
        running_containers = set()
        ctrs = DOCKER_CLIENT.containers
        try:
            lst = ctrs.list(filters={"status": "running"})

            for ctr in lst:
                running_containers.add(ctr.name)
                if ctr.name not in self.container_critical_processes:
                    self.fill_critical_process_by_container(ctr.name)
        except docker.errors.APIError as err:
            logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err))

        return running_containers

    def get_critical_process_list_from_file(self, container, critical_processes_file):
        """Read critical process name list from critical processes file

        Args:
            container (str): contianer name
            critical_processes_file (str): critical processes file path

        Returns:
            critical_process_list: A list of critical process names
        """
        critical_process_list = []

        with open(critical_processes_file, 'r') as file:
            for line in file:
                # Try to match a line like "program:<process_name>"
                match = re.match(r"^\s*((.+):(.*))*\s*$", line)
                if match is None:
                    if container not in self.bad_containers:
                        self.bad_containers.add(container)
                        logger.log_error('Invalid syntax in critical_processes file of {}'.format(container))
                    continue

                identifier_key = match.group(2).strip()
                identifier_value = match.group(3).strip()
                if identifier_key == "program" and identifier_value:
                    critical_process_list.append(identifier_value)

        return critical_process_list

    def fill_critical_process_by_container(self, container):
        """Get critical process for a given container

        Args:
            container (str): container name
        """
        # Get container volumn folder
        container_folder = self._get_container_folder(container)
        if not container_folder:
            logger.log_error('Failed to get container folder for {}'.format(container_folder))
            return

        if not os.path.exists(container_folder):
            logger.log_error('Container folder does not exist: {}'.format(container_folder))
            return

        # Get critical_processes file path
        critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH)
        if not os.path.isfile(critical_processes_file):
            # Critical process file does not exist, the container has no critical processes.
            logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file))
            self._update_container_critical_processes(container, [])
            return

        # Get critical process list from critical_processes
        critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file)
        self._update_container_critical_processes(container, critical_process_list)

    def _update_container_critical_processes(self, container, critical_process_list):
        self.container_critical_processes[container] = critical_process_list
        self.need_save_cache = True

    def _get_container_folder(self, container):
        container_folder = utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container))
        if container_folder is None:
            return container_folder

        return container_folder.strip()

    def save_critical_process_cache(self):
        """Save self.container_critical_processes to a cache file
        """
        if not self.need_save_cache:
            return

        self.need_save_cache = False
        if not self.container_critical_processes:
            # if container_critical_processes is empty, don't save it
            return

        if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE):
            # if cache file exists, remove it
            os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE)

        with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f:
            pickle.dump(self.container_critical_processes, f)

    def load_critical_process_cache(self):
        if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE):
            # cache file does not exist
            return

        with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f:
            self.container_critical_processes = pickle.load(f)

    def reset(self):
        self._info = {}
@@ -31,16 +204,14 @@ class ServiceChecker(HealthChecker):
    def get_category(self):
        return 'Services'

    def check(self, config):
    def check_by_monit(self, config):
        """
        Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
        process and file system.
        Get and analyze the output of $CHECK_CMD, collect status for file system or customize checker if any.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()
        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip()
        if output != 'active':
        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
        if not output or output.strip() != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

@@ -58,7 +229,7 @@ class ServiceChecker(HealthChecker):

        for line in lines[2:]:
            name = line[0:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
            if config and config.ignore_services and name in config.ignore_services:
                continue
            status = line[status_begin:type_begin].strip()
            service_type = line[type_begin:].strip()
@@ -70,3 +241,105 @@ class ServiceChecker(HealthChecker):
            else:
                self.set_object_ok(service_type, name)
        return

    def check_services(self, config):
        """Check status of critical services and critical processes

        Args:
            config (config.Config): Health checker configuration.
        """
        config_db = swsscommon.ConfigDBConnector()
        config_db.connect()
        feature_table = config_db.get_table("FEATURE")
        expected_running_containers, self.container_feature_dict = self.get_expected_running_containers(feature_table)
        current_running_containers = self.get_current_running_containers()

        newly_disabled_containers = set(self.container_critical_processes.keys()).difference(expected_running_containers)
        for newly_disabled_container in newly_disabled_containers:
            self.container_critical_processes.pop(newly_disabled_container)

        self.save_critical_process_cache()

        not_running_containers = expected_running_containers.difference(current_running_containers)
        for container in not_running_containers:
            self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container))

        if not self.container_critical_processes:
            # Critical process is empty, not expect
            self.set_object_not_ok('Service', 'system', 'no critical process found')
            return

        for container, critical_process_list in self.container_critical_processes.items():
            self.check_process_existence(container, critical_process_list, config, feature_table)

        for bad_container in self.bad_containers:
            self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect')

    def check(self, config):
        """Check critical system service status.

        Args:
            config (object): Health checker configuration.
        """
        self.reset()
        self.check_by_monit(config)
        self.check_services(config)


    def _parse_supervisorctl_status(self, process_status):
        """Expected input:
            arp_update RUNNING pid 67, uptime 1:03:56
            buffermgrd RUNNING pid 81, uptime 1:03:56

        Args:
            process_status (list): List of process status
        """
        data = {}
        for line in process_status:
            line = line.strip()
            if not line:
                continue
            items = line.split()
            if len(items) < 2:
                continue
            data[items[0].strip()] = items[1].strip()
        return data

    def check_process_existence(self, container_name, critical_process_list, config, feature_table):
        """Check whether the process in the specified container is running or not.

        Args:
            container_name (str): Container name
            critical_process_list (list): Critical processes
            config (object): Health checker configuration.
            feature_table (object): Feature table
        """
        feature_name = self.container_feature_dict[container_name]
        if feature_name in feature_table:
            # We look into the 'FEATURE' table to verify whether the container is disabled or not.
            # If the container is diabled, we exit.
            if ("state" in feature_table[feature_name]
                    and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]):

                # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because
                # it not always possible to get process cmdline in supervisor.conf. E.g, cmdline of orchagent is "/usr/bin/orchagent",
                # however, in supervisor.conf it is "/usr/bin/orchagent.sh"
                cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name)
                process_status = utils.run_command(cmd)
                if process_status is None:
                    for process_name in critical_process_list:
                        self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
                    return

                process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())
                for process_name in critical_process_list:
                    if config and config.ignore_services and process_name in config.ignore_services:
                        continue

                    # Sometimes process_name is in critical_processes file, but it is not in supervisor.conf, such process will not run in container.
                    # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
                    if process_name in process_status:
                        if process_status[process_name] != 'RUNNING':
                            self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
                        else:
                            self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))
@@ -3,6 +3,7 @@ from setuptools import setup
dependencies = [
    'natsort',
    'sonic_py_common',
    'docker'
]

setup(
@@ -0,0 +1,2 @@
program:snmpd
program:snmp-subagent
src/system-health/tests/system_health_monitoring_config.json (new file)
@@ -0,0 +1,11 @@
{
  "services_to_ignore": ["dummy_service"],
  "devices_to_ignore": ["psu.voltage"],
  "user_defined_checkers": [],
  "polling_interval": 60,
  "led_color": {
    "fault": "orange",
    "normal": "green",
    "booting": "orange_blink"
  }
}
@@ -8,6 +8,7 @@
    2. HealthCheckerManager
    3. Config
"""
import copy
import os
import sys
from swsscommon import swsscommon
@@ -30,11 +31,21 @@ from health_checker.manager import HealthCheckerManager
from health_checker.service_checker import ServiceChecker
from health_checker.user_defined_checker import UserDefinedChecker

mock_supervisorctl_output = """
snmpd RUNNING pid 67, uptime 1:03:56
snmp-subagent EXITED Oct 19 01:53 AM
"""
device_info.get_platform = MagicMock(return_value='unittest')


def test_user_defined_checker():
    utils.run_command = MagicMock(return_value='')
def setup():
    if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE):
        os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE)


@patch('health_checker.utils.run_command')
def test_user_defined_checker(mock_run):
    mock_run.return_value = ''

    checker = UserDefinedChecker('')
    checker.check(None)
@@ -43,29 +54,195 @@ def test_user_defined_checker():
    checker.reset()
    assert len(checker._info) == 0

    utils.run_command = MagicMock(return_value='\n\n\n')
    mock_run.return_value = '\n\n\n'
    checker.check(None)
    assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n'
    utils.run_command = MagicMock(return_value=valid_output)
    mock_run.return_value = valid_output
    checker.check(None)
    assert checker.get_category() == 'MyCategory'
    assert 'Device1' in checker._info
    assert 'Device2' in checker._info
    assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
    assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK


def test_service_checker():
    return_value = ''
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
@patch('docker.DockerClient')
@patch('health_checker.utils.run_command')
@patch('swsscommon.swsscommon.ConfigDBConnector')
def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_client):
    mock_db_data = MagicMock()
    mock_get_table = MagicMock()
    mock_db_data.get_table = mock_get_table
    mock_config_db.return_value = mock_db_data
    mock_get_table.return_value = {
        'snmp': {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'False',

    def mock_run_command(cmd):
        if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD:
            return 'active'
        else:
            return return_value
        }
    }
    mock_containers = MagicMock()
    mock_snmp_container = MagicMock()
    mock_snmp_container.name = 'snmp'
    mock_containers.list = MagicMock(return_value=[mock_snmp_container])
    mock_docker_client_object = MagicMock()
    mock_docker_client.return_value = mock_docker_client_object
    mock_docker_client_object.containers = mock_containers

    utils.run_command = mock_run_command
    mock_run.return_value = mock_supervisorctl_output

    checker = ServiceChecker()
    assert checker.get_category() == 'Services'
    config = Config()
    checker.check(config)
    assert 'snmp:snmpd' in checker._info
    assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK

    assert 'snmp:snmp-subagent' in checker._info
    assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    mock_get_table.return_value = {
        'new_service': {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'False',
        },
        'snmp': {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'False',

        }
    }
    mock_ns_container = MagicMock()
    mock_ns_container.name = 'new_service'
    mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container])
    checker.check(config)
    assert 'new_service' in checker.container_critical_processes

    assert 'new_service:snmpd' in checker._info
    assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK

    assert 'new_service:snmp-subagent' in checker._info
    assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    mock_containers.list = MagicMock(return_value=[mock_snmp_container])
    checker.check(config)
    assert 'new_service' in checker._info
    assert checker._info['new_service'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container])
    mock_run.return_value = None
    checker.check(config)
    assert 'new_service:snmpd' in checker._info
    assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    assert 'new_service:snmp-subagent' in checker._info
    assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    origin_container_critical_processes = copy.deepcopy(checker.container_critical_processes)
    checker.save_critical_process_cache()
    checker.load_critical_process_cache()
    assert origin_container_critical_processes == checker.container_critical_processes


@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
@patch('health_checker.utils.run_command', MagicMock(return_value=mock_supervisorctl_output))
@patch('sonic_py_common.multi_asic.get_num_asics', MagicMock(return_value=3))
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=True))
@patch('sonic_py_common.multi_asic.get_namespace_list', MagicMock(return_value=[str(x) for x in range(3)]))
@patch('sonic_py_common.multi_asic.get_current_namespace', MagicMock(return_value=''))
@patch('docker.DockerClient')
@patch('swsscommon.swsscommon.ConfigDBConnector')
def test_service_checker_multi_asic(mock_config_db, mock_docker_client):
    mock_db_data = MagicMock()
    mock_db_data.get_table = MagicMock()
    mock_config_db.return_value = mock_db_data

    mock_db_data.get_table.return_value = {
        'snmp': {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'True',

        }
    }

    mock_containers = MagicMock()
    mock_snmp_container = MagicMock()
    mock_snmp_container.name = 'snmp'
    list_return_value = [mock_snmp_container]
    for i in range(3):
        mock_container = MagicMock()
        mock_container.name = 'snmp' + str(i)
        list_return_value.append(mock_container)

    mock_containers.list = MagicMock(return_value=list_return_value)
    mock_docker_client_object = MagicMock()
    mock_docker_client.return_value = mock_docker_client_object
    mock_docker_client_object.containers = mock_containers

    checker = ServiceChecker()

    config = Config()
    checker.check(config)
    assert 'snmp' in checker.container_critical_processes
    assert 'snmp:snmpd' in checker._info
    assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
    assert 'snmp0:snmpd' in checker._info
    assert checker._info['snmp0:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
    assert 'snmp1:snmpd' in checker._info
    assert checker._info['snmp1:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
    assert 'snmp2:snmpd' in checker._info
    assert checker._info['snmp2:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK

    assert 'snmp:snmp-subagent' in checker._info
    assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
    assert 'snmp0:snmp-subagent' in checker._info
    assert checker._info['snmp0:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
    assert 'snmp1:snmp-subagent' in checker._info
    assert checker._info['snmp1:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
    assert 'snmp2:snmp-subagent' in checker._info
    assert checker._info['snmp2:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK


@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock())
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker.check_by_monit', MagicMock())
@patch('docker.DockerClient')
@patch('swsscommon.swsscommon.ConfigDBConnector.get_table')
def test_service_checker_no_critical_process(mock_get_table, mock_docker_client):
    mock_get_table.return_value = {
        'snmp': {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'True',

        }
    }
    mock_containers = MagicMock()
    mock_containers.list = MagicMock(return_value=[])
    mock_docker_client_object = MagicMock()
    mock_docker_client.return_value = mock_docker_client_object
    mock_docker_client_object.containers = mock_containers

    checker = ServiceChecker()
    config = Config()
    checker.check(config)
    assert 'system' in checker._info
    assert checker._info['system'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

@patch('health_checker.service_checker.ServiceChecker.check_services', MagicMock())
@patch('health_checker.utils.run_command')
def test_service_checker_check_by_monit(mock_run):
    return_value = 'Monit 5.20.0 uptime: 3h 54m\n' \
                   'Service Name Status Type\n' \
                   'sonic Running System\n' \
@@ -74,7 +251,7 @@ def test_service_checker():
                   'orchagent Running Process\n' \
                   'root-overlay Accessible Filesystem\n' \
                   'var-log Is not accessible Filesystem\n'

    mock_run.side_effect = ['active', return_value]
    checker = ServiceChecker()
    config = Config()
    checker.check(config)
@@ -185,6 +362,7 @@ def test_hardware_checker():
        })

    checker = HardwareChecker()
    assert checker.get_category() == 'Hardware'
    config = Config()
    checker.check(config)

@@ -217,3 +395,129 @@ def test_hardware_checker():

    assert 'PSU 5' in checker._info
    assert checker._info['PSU 5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK


def test_config():
    config = Config()
    config._config_file = os.path.join(test_path, Config.CONFIG_FILE)

    assert config.config_file_exists()
    config.load_config()
    assert config.interval == 60
    assert 'dummy_service' in config.ignore_services
    assert 'psu.voltage' in config.ignore_devices
    assert len(config.user_defined_checkers) == 0

    assert config.get_led_color('fault') == 'orange'
    assert config.get_led_color('normal') == 'green'
    assert config.get_led_color('booting') == 'orange_blink'
    assert config.get_bootup_timeout() == 300

    config._reset()
    assert not config.ignore_services
    assert not config.ignore_devices
    assert not config.user_defined_checkers
    assert not config.config_data

    assert config.get_led_color('fault') == 'red'
    assert config.get_led_color('normal') == 'green'
    assert config.get_led_color('booting') == 'orange_blink'

    config._last_mtime = 1
    config._config_file = 'notExistFile'
    config.load_config()
    assert not config._last_mtime


@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock())
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('health_checker.service_checker.ServiceChecker.check', MagicMock())
@patch('health_checker.hardware_checker.HardwareChecker.check', MagicMock())
@patch('health_checker.user_defined_checker.UserDefinedChecker.check', MagicMock())
@patch('swsscommon.swsscommon.ConfigDBConnector.get_table', MagicMock())
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_category', MagicMock(return_value='UserDefine'))
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
@patch('health_checker.service_checker.ServiceChecker.get_info')
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
@patch('health_checker.utils.get_uptime')
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
    chassis = MagicMock()
    chassis.set_status_led = MagicMock()

    manager = HealthCheckerManager()
    manager.config.user_defined_checkers = ['some check']
    assert manager._state == HealthCheckerManager.STATE_BOOTING
    assert len(manager._checkers) == 2

    mock_uptime.return_value = 200
    assert manager._is_system_booting()
    state, stat = manager.check(chassis)
    assert state == HealthCheckerManager.STATE_BOOTING
    assert len(stat) == 0
    chassis.set_status_led.assert_called_with('orange_blink')

    mock_uptime.return_value = 500
    assert not manager._is_system_booting()
    assert manager._state == HealthCheckerManager.STATE_RUNNING
    mock_hw_info.return_value = {
        'ASIC': {
            'type': 'ASIC',
            'message': '',
            'status': 'OK'
        },
        'fan1': {
            'type': 'Fan',
            'message': '',
            'status': 'OK'
        },
    }
    mock_service_info.return_value = {
        'snmp:snmpd': {
            'type': 'Process',
            'message': '',
            'status': 'OK'
        }
    }
    mock_udc_info.return_value = {
        'udc': {
            'type': 'Database',
            'message': '',
            'status': 'OK'
        }
    }
    state, stat = manager.check(chassis)
    assert state == HealthCheckerManager.STATE_RUNNING
    assert 'Services' in stat
    assert stat['Services']['snmp:snmpd']['status'] == 'OK'

    assert 'Hardware' in stat
    assert stat['Hardware']['ASIC']['status'] == 'OK'
    assert stat['Hardware']['fan1']['status'] == 'OK'

    assert 'UserDefine' in stat
    assert stat['UserDefine']['udc']['status'] == 'OK'

    mock_hw_info.side_effect = RuntimeError()
    mock_service_info.side_effect = RuntimeError()
    mock_udc_info.side_effect = RuntimeError()
    state, stat = manager.check(chassis)
    assert 'Internal' in stat
    assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
    assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
    assert stat['Internal']['UserDefinedChecker - some check']['status'] == 'Not OK'

    chassis.set_status_led.side_effect = NotImplementedError()
    manager._set_system_led(chassis, manager.config, 'normal')

    chassis.set_status_led.side_effect = RuntimeError()
    manager._set_system_led(chassis, manager.config, 'normal')

def test_utils():
    output = utils.run_command('some invalid command')
    assert not output

    output = utils.run_command('ls')
    assert output

    uptime = utils.get_uptime()
    assert uptime > 0