[system-health] Add support for monitoring system health (#4835)
* system health first commit * system health daemon first commit * Finish healthd * Changes due to lower layer logic change * Get ASIC temperature from TEMPERATURE_INFO table * Add system health make rule and service files * fix bugs found during manual test * Change make file to install system-health library to host * Set system LED to blink on bootup time * Caught exceptions in system health checker to make it more robust * fix issue that fan/psu presence will always be true * fix issue for external checker * move system-health service to right after rc-local service * Set system-health service start after database service * Get system up time via /proc/uptime * Provide more information in stat for CLI to use * fix typo * Set default category to External for external checker * If external checker reported OK, save it to stat too * Trim string for external checker output * fix issue: PSU voltage check always return OK * Add unit test cases for system health library * Fix LGTM warnings * fix demo comments: 1. get boot up timeout from monit configuration file; 2. set system led in library instead of daemon * Remove boot_timeout configuration because it will get from monit config file * Fix argument miss * fix unit test failure * fix issue: summary status is not correct * Fix format issues found in code review * rename th to threshold to make it clearer * Fix review comment: 1. add a .dep file for system health; 2. deprecated daemon_base and uses sonic-py-common instead * Fix unit test failure * Fix LGTM alert * Fix LGTM alert * Fix review comments * Fix review comment * 1. Add relevant comments for system health; 2. 
rename external_checker to user_define_checker * Ignore check for unknown service type * Fix unit test issue * Rename user define checker to user defined checker * Rename user_define_checkers to user_defined_checkers for configuration file * Renmae file user_define_checker.py -> user_defined_checker.py * Fix typo * Adjust import order for config.py Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com> * Adjust import order for src/system-health/health_checker/hardware_checker.py Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com> * Adjust import order for src/system-health/scripts/healthd Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com> * Adjust import orders in src/system-health/tests/test_system_health.py * Fix typo * Add new line after import * If system health configuration file not exist, healthd should exit * Fix indent and enable pytest coverage * Fix typo * Fix typo * Remove global logger and use log functions inherited from super class * Change info level logger to notice level Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>
This commit is contained in:
parent
8e0e316cf8
commit
1c97a03b81
@ -1,11 +1,11 @@
|
||||
{
|
||||
"services_to_ignore": [],
|
||||
"devices_to_ignore": ["psu.voltage", "psu.temperature"],
|
||||
"external_checkers": [],
|
||||
"user_defined_checkers": [],
|
||||
"polling_interval": 60,
|
||||
"led_color": {
|
||||
"fault": "orange",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
"fault": "orange",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,11 @@
|
||||
{
|
||||
"services_to_ignore": [],
|
||||
"devices_to_ignore": ["psu.voltage"],
|
||||
"external_checkers": [],
|
||||
"user_defined_checkers": [],
|
||||
"polling_interval": 60,
|
||||
"led_color": {
|
||||
"fault": "orange",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
"fault": "orange",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,11 @@
|
||||
{
|
||||
"services_to_ignore": [],
|
||||
"devices_to_ignore": ["psu","asic","fan"],
|
||||
"external_checkers": [],
|
||||
"user_defined_checkers": [],
|
||||
"polling_interval": 60,
|
||||
"led_color": {
|
||||
"fault": "orange",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
"fault": "orange",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
}
|
||||
}
|
||||
|
@ -172,6 +172,12 @@ sudo cp {{platform_common_py2_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2
|
||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $PLATFORM_COMMON_PY2_WHEEL_NAME
|
||||
sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2_WHEEL_NAME
|
||||
|
||||
# Install system-health Python 2 package
|
||||
SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}})
|
||||
sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
|
||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $SYSTEM_HEALTH_PY2_WHEEL_NAME
|
||||
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
|
||||
|
||||
# Install sonic-platform-common Python 3 package
|
||||
PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}})
|
||||
sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME
|
||||
@ -283,6 +289,10 @@ sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d
|
||||
sudo cp $IMAGE_CONFIGS/syslog/override.conf $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d/override.conf
|
||||
sudo cp $IMAGE_CONFIGS/syslog/host_umount.sh $FILESYSTEM_ROOT/usr/bin/
|
||||
|
||||
# Copy system-health files
|
||||
sudo LANG=C cp $IMAGE_CONFIGS/system-health/system-health.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
|
||||
echo "system-health.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
||||
|
||||
# Copy logrotate.d configuration files
|
||||
sudo cp -f $IMAGE_CONFIGS/logrotate/logrotate.d/* $FILESYSTEM_ROOT/etc/logrotate.d/
|
||||
|
||||
|
11
files/image_config/system-health/system-health.service
Normal file
11
files/image_config/system-health/system-health.service
Normal file
@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=SONiC system health monitor
|
||||
Requires=database.service updategraph.service
|
||||
After=database.service updategraph.service
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/local/bin/healthd
|
||||
Restart=always
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
8
rules/system-health.dep
Normal file
8
rules/system-health.dep
Normal file
@ -0,0 +1,8 @@
|
||||
SPATH := $($(SYSTEM_HEALTH)_SRC_PATH)
|
||||
DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/system-health.mk rules/system-health.dep
|
||||
DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST)
|
||||
DEP_FILES += $(shell git ls-files $(SPATH))
|
||||
|
||||
$(SYSTEM_HEALTH)_CACHE_MODE := GIT_CONTENT_SHA
|
||||
$(SYSTEM_HEALTH)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST)
|
||||
$(SYSTEM_HEALTH)_DEP_FILES := $(DEP_FILES)
|
9
rules/system-health.mk
Normal file
9
rules/system-health.mk
Normal file
@ -0,0 +1,9 @@
|
||||
# system health python2 wheel
|
||||
|
||||
SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl
|
||||
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
|
||||
$(SYSTEM_HEALTH)_PYTHON_VERSION = 2
|
||||
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE)
|
||||
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
|
||||
|
||||
export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
|
3
slave.mk
3
slave.mk
@ -819,7 +819,8 @@ $(addprefix $(TARGET_PATH)/, $(SONIC_INSTALLERS)) : $(TARGET_PATH)/% : \
|
||||
$(addprefix $(PYTHON_WHEELS_PATH)/,$(REDIS_DUMP_LOAD_PY2)) \
|
||||
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2)) \
|
||||
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MODELS_PY3)) \
|
||||
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY))
|
||||
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) \
|
||||
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))
|
||||
$(HEADER)
|
||||
# Pass initramfs and linux kernel explicitly. They are used for all platforms
|
||||
export debs_path="$(IMAGE_DISTRO_DEBS_PATH)"
|
||||
|
8
src/system-health/.gitignore
vendored
Normal file
8
src/system-health/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
*/deb_dist/
|
||||
*/dist/
|
||||
*/build/
|
||||
*/*.tar.gz
|
||||
*/*.egg-info
|
||||
*/.cache/
|
||||
*.pyc
|
||||
*/__pycache__/
|
2
src/system-health/health_checker/__init__.py
Normal file
2
src/system-health/health_checker/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
from . import hardware_checker
|
||||
from . import service_checker
|
144
src/system-health/health_checker/config.py
Normal file
144
src/system-health/health_checker/config.py
Normal file
@ -0,0 +1,144 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from sonic_py_common import device_info
|
||||
|
||||
|
||||
class Config(object):
    """
    Manage configuration of system health.

    Settings are loaded from a per-platform JSON file. Every entry falls back to a built-in default when the file
    is missing or cannot be parsed.
    """

    # Default system health check interval
    DEFAULT_INTERVAL = 60

    # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
    DEFAULT_BOOTUP_TIMEOUT = 300

    # Default LED configuration. Different platform has different LED capability. This configuration allow vendor to
    # override the default behavior.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }

    # System health configuration file name
    CONFIG_FILE = 'system_health_monitoring_config.json'

    # Monit service configuration file path
    MONIT_CONFIG_FILE = '/etc/monit/monitrc'

    # Monit service start delay configuration entry
    MONIT_START_DELAY_CONFIG = 'with start delay'

    def __init__(self):
        """
        Constructor. Initialize all configuration entry to default value in case there is no configuration file.
        """
        self.platform_name = device_info.get_platform()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def config_file_exists(self):
        """
        Tell whether the platform configuration file is present on disk.
        :return: True if the configuration file exists, False otherwise
        """
        return os.path.exists(self._config_file)

    def load_config(self):
        """
        Load the configuration file from disk.
        1. If there is no configuration file, current config entries will reset to default value
        2. Only read the configuration file if its modification time changed, for better performance
        3. If there is any format issues in configuration file, current config entries will reset to default value
        :return:
        """
        if not self.config_file_exists():
            if self._last_mtime is not None:
                self._reset()
            return

        try:
            # Compare the modification time only. A full stat result also carries the access time, which the
            # read below may update, and that would defeat the "reload only on change" check.
            mtime = os.stat(self._config_file).st_mtime
            if mtime == self._last_mtime:
                return

            self._last_mtime = mtime
            with open(self._config_file, 'r') as f:
                self.config_data = json.load(f)

            self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
            self.ignore_services = self._get_list_data('services_to_ignore')
            self.ignore_devices = self._get_list_data('devices_to_ignore')
            self.user_defined_checkers = self._get_list_data('user_defined_checkers')
        except Exception:
            # Deliberately broad: an unreadable or malformed file must never stop the caller, it only
            # puts every entry back to its default so the next call retries the load.
            self._reset()

    def _reset(self):
        """
        Reset current configuration entry to default value
        :return:
        """
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def get_led_color(self, status):
        """
        Get desired LED color according to the input status
        :param status: System health status, one of the keys of DEFAULT_LED_CONFIG
        :return: String LED color; the value from the configuration file if it defines one for this status,
                 otherwise the built-in default
        :raises KeyError: if status is neither configured nor a key of DEFAULT_LED_CONFIG
        """
        if self.config_data and 'led_color' in self.config_data:
            if status in self.config_data['led_color']:
                return self.config_data['led_color'][status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """
        Get boot up timeout from monit configuration file.
        1. If monit configuration file does not exist, return default value
        2. If there is any exception while parsing monit config, return default value
        3. If the file has no start delay entry, return default value
        :return: Integer timeout value
        """
        if not os.path.exists(Config.MONIT_CONFIG_FILE):
            return self.DEFAULT_BOOTUP_TIMEOUT

        try:
            with open(Config.MONIT_CONFIG_FILE) as f:
                for line in f:
                    # Keep only the text in front of a '#' comment. Splitting (instead of slicing with the
                    # result of find()) leaves lines without any comment untouched.
                    line = line.split('#', 1)[0].strip()
                    if not line:
                        continue

                    pos = line.find(Config.MONIT_START_DELAY_CONFIG)
                    if pos != -1:
                        return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
        except Exception:
            # Best effort: any read or parse failure falls through to the default value below.
            pass

        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_list_data(self, key):
        """
        Get list type configuration data by key and remove duplicate element.
        :param key: Key of the configuration entry
        :return: A set of configuration data if key exists and its value is a list, otherwise None
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None
|
248
src/system-health/health_checker/hardware_checker.py
Normal file
248
src/system-health/health_checker/hardware_checker.py
Normal file
@ -0,0 +1,248 @@
|
||||
from natsort import natsorted
|
||||
from swsssdk import SonicV2Connector
|
||||
|
||||
from .health_checker import HealthChecker
|
||||
|
||||
|
||||
class HardwareChecker(HealthChecker):
    """
    Check system hardware status. For now, it checks ASIC, PSU and fan status.

    All readings are fetched from STATE_DB; every checked object is recorded as OK or Not OK through the
    HealthChecker helpers.
    """
    ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
    FAN_TABLE_NAME = 'FAN_INFO'
    PSU_TABLE_NAME = 'PSU_INFO'

    def __init__(self):
        HealthChecker.__init__(self)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)

    def get_category(self):
        return 'Hardware'

    def check(self, config):
        """
        Drop the previous results and run the ASIC, fan and PSU checks in that order.
        :param config: Health checker configuration
        :return:
        """
        self.reset()
        for probe in (self._check_asic_status, self._check_fan_status, self._check_psu_status):
            probe(config)

    def _check_asic_status(self, config):
        """
        Check if ASIC temperature is in valid range.
        :param config: Health checker configuration
        :return:
        """
        if config.ignore_devices and 'asic' in config.ignore_devices:
            return

        asic_key = HardwareChecker.ASIC_TEMPERATURE_KEY
        temperature = self._db.get(self._db.STATE_DB, asic_key, 'temperature')
        temperature_threshold = self._db.get(self._db.STATE_DB, asic_key, 'high_threshold')

        if not temperature:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
            return

        if not temperature_threshold:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
            return

        try:
            # The same names are rebound on purpose: the error message below reports whatever
            # was converted so far.
            temperature = float(temperature)
            temperature_threshold = float(temperature_threshold)
            if temperature > temperature_threshold:
                too_hot = 'ASIC temperature is too hot, temperature={}, threshold={}'.format(
                    temperature, temperature_threshold)
                self.set_object_not_ok('ASIC', 'ASIC', too_hot)
            else:
                self.set_object_ok('ASIC', 'ASIC')
        except ValueError:
            invalid = 'Invalid ASIC temperature data, temperature={}, threshold={}'.format(
                temperature, temperature_threshold)
            self.set_object_not_ok('ASIC', 'ASIC', invalid)

    def _check_fan_status(self, config):
        """
        Check fan status including:
            1. Check all fans are present
            2. Check all fans are in good state
            3. Check fan speed is in valid range
        :param config: Health checker configuration
        :return:
        """
        if config.ignore_devices and 'fan' in config.ignore_devices:
            return

        fan_keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
        if not fan_keys:
            self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
            return

        for db_key in natsorted(fan_keys):
            parts = db_key.split('|')
            if len(parts) != 2:  # error data in DB, log it and ignore
                self.set_object_not_ok('Fan', db_key, 'Invalid key for FAN_INFO: {}'.format(db_key))
                continue

            name = parts[1]
            if config.ignore_devices and name in config.ignore_devices:
                continue

            fan_info = self._db.get_all(self._db.STATE_DB, db_key)

            if fan_info.get('presence', 'false').lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
                continue

            if fan_info.get('status', 'false').lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
                continue

            if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
                speed = fan_info.get('speed')
                speed_target = fan_info.get('speed_target')
                speed_tolerance = fan_info.get('speed_tolerance')

                if not speed:
                    self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
                    continue
                if not speed_target:
                    self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
                    continue
                if not speed_tolerance:
                    self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
                    continue

                try:
                    speed = float(speed)
                    speed_target = float(speed_target)
                    speed_tolerance = float(speed_tolerance)
                    # Tolerance is a percentage of the target speed
                    speed_min_th = speed_target * (1 - speed_tolerance / 100)
                    speed_max_th = speed_target * (1 + speed_tolerance / 100)
                    if speed < speed_min_th or speed > speed_max_th:
                        out_of_range = '{} speed is out of range, speed={}, range=[{},{}]'.format(
                            name, speed, speed_min_th, speed_max_th)
                        self.set_object_not_ok('Fan', name, out_of_range)
                        continue
                except ValueError:
                    invalid = 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
                        name, speed, speed_target, speed_tolerance)
                    self.set_object_not_ok('Fan', name, invalid)
                    continue

            # Every enabled check passed for this fan
            self.set_object_ok('Fan', name)

    def _check_psu_status(self, config):
        """
        Check PSU status including:
            1. Check all PSUs are present
            2. Check all PSUs are power on
            3. Check PSU temperature is in valid range
            4. Check PSU voltage is in valid range
        :param config: Health checker configuration
        :return:
        """
        if config.ignore_devices and 'psu' in config.ignore_devices:
            return

        psu_keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
        if not psu_keys:
            self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
            return

        for db_key in natsorted(psu_keys):
            parts = db_key.split('|')
            if len(parts) != 2:  # error data in DB, log it and ignore
                self.set_object_not_ok('PSU', db_key, 'Invalid key for PSU_INFO: {}'.format(db_key))
                continue

            name = parts[1]
            if config.ignore_devices and name in config.ignore_devices:
                continue

            psu_info = self._db.get_all(self._db.STATE_DB, db_key)

            if psu_info.get('presence', 'false').lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
                continue

            if psu_info.get('status', 'false').lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
                continue

            if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'):
                temperature = psu_info.get('temp')
                temperature_threshold = psu_info.get('temp_threshold')

                if temperature is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
                    continue
                if temperature_threshold is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get temperature threshold data for {}'.format(name))
                    continue

                try:
                    temperature = float(temperature)
                    temperature_threshold = float(temperature_threshold)
                    if temperature > temperature_threshold:
                        too_hot = '{} temperature is too hot, temperature={}, threshold={}'.format(
                            name, temperature, temperature_threshold)
                        self.set_object_not_ok('PSU', name, too_hot)
                        continue
                except ValueError:
                    invalid = 'Invalid temperature data for {}, temperature={}, threshold={}'.format(
                        name, temperature, temperature_threshold)
                    self.set_object_not_ok('PSU', name, invalid)
                    continue

            if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'):
                voltage = psu_info.get('voltage')
                voltage_min_th = psu_info.get('voltage_min_threshold')
                voltage_max_th = psu_info.get('voltage_max_threshold')

                if voltage is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
                    continue
                if voltage_min_th is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage minimum threshold data for {}'.format(name))
                    continue
                if voltage_max_th is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage maximum threshold data for {}'.format(name))
                    continue

                try:
                    voltage = float(voltage)
                    voltage_min_th = float(voltage_min_th)
                    voltage_max_th = float(voltage_max_th)
                    if voltage < voltage_min_th or voltage > voltage_max_th:
                        out_of_range = '{} voltage is out of range, voltage={}, range=[{},{}]'.format(
                            name, voltage, voltage_min_th, voltage_max_th)
                        self.set_object_not_ok('PSU', name, out_of_range)
                        continue
                except ValueError:
                    invalid = 'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(
                        name, voltage, voltage_min_th, voltage_max_th)
                    self.set_object_not_ok('PSU', name, invalid)
                    continue

            # Every enabled check passed for this PSU
            self.set_object_ok('PSU', name)

    def reset(self):
        self._info = {}

    @classmethod
    def _ignore_check(cls, ignore_set, category, object_name, check_point):
        """
        Tell whether one check point should be skipped for an object.
        :param ignore_set: Collection of ignore entries from the configuration, may be empty or None
        :param category: Device category, e.g. 'fan' or 'psu'
        :param object_name: Name of the individual device
        :param check_point: Name of the check, e.g. 'speed'
        :return: True if '<category>.<check_point>' or '<object_name>.<check_point>' is in ignore_set
        """
        if not ignore_set:
            return False

        return any('{}.{}'.format(scope, check_point) in ignore_set for scope in (category, object_name))
|
86
src/system-health/health_checker/health_checker.py
Normal file
86
src/system-health/health_checker/health_checker.py
Normal file
@ -0,0 +1,86 @@
|
||||
class HealthChecker(object):
    """
    Base class for health checker. A checker is an object that performs system health check for a particular category,
    it collects and stores information after the check.
    """
    INFO_FIELD_OBJECT_TYPE = 'type'
    INFO_FIELD_OBJECT_STATUS = 'status'
    INFO_FIELD_OBJECT_MSG = 'message'

    STATUS_OK = 'OK'
    STATUS_NOT_OK = 'Not OK'

    # Class-level summary shared by all checkers; set_object_not_ok switches it to STATUS_NOT_OK.
    summary = STATUS_OK

    def __init__(self):
        self._info = {}

    def reset(self):
        """
        Reset the status of the checker. Called every time before the check.
        The base implementation does nothing.
        :return:
        """
        return None

    def get_category(self):
        """
        Get category of the checker.
        The base implementation returns None.
        :return: String category
        """
        return None

    def get_info(self):
        """
        Get information of the checker. A checker usually checks a few objects and each object status will be put to
        self._info.
        :return: Check result, a dictionary keyed by object name.
        """
        return self._info

    def check(self, config):
        """
        Perform the check. The base implementation does nothing.
        :param config: Health checker configuration.
        :return:
        """
        return None

    def __str__(self):
        return type(self).__name__

    def add_info(self, object_name, key, value):
        """
        Add check result for an object.
        :param object_name: Object name.
        :param key: Object attribute name.
        :param value: Object attribute value.
        :return:
        """
        # Create the per-object dictionary on first use, then store the attribute
        self._info.setdefault(object_name, {})[key] = value

    def set_object_not_ok(self, object_type, object_name, message):
        """
        Set that an object is not OK.
        :param object_type: Object type.
        :param object_name: Object name.
        :param message: A message to describe what is wrong with the object.
        :return:
        """
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, message),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK),
        )
        for field, value in fields:
            self.add_info(object_name, field, value)

        # One failing object is enough to mark the overall summary as not OK
        HealthChecker.summary = HealthChecker.STATUS_NOT_OK

    def set_object_ok(self, object_type, object_name):
        """
        Set that an object is in good state.
        :param object_type: Object type.
        :param object_name: Object name.
        :return:
        """
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, ''),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK),
        )
        for field, value in fields:
            self.add_info(object_name, field, value)
|
101
src/system-health/health_checker/manager.py
Normal file
101
src/system-health/health_checker/manager.py
Normal file
@ -0,0 +1,101 @@
|
||||
class HealthCheckerManager(object):
    """
    Manage all system health checkers and system health configuration.
    """
    STATE_BOOTING = 'booting'
    STATE_RUNNING = 'running'

    # Boot up timeout; fetched from the configuration object the first time it is needed
    boot_timeout = None

    def __init__(self):
        self._checkers = []
        self._state = self.STATE_BOOTING

        from .config import Config
        self.config = Config()
        self.initialize()

    def initialize(self):
        """
        Initialize the manager. Create service checker and hardware checker by default.
        :return:
        """
        from .service_checker import ServiceChecker
        from .hardware_checker import HardwareChecker

        for checker_class in (ServiceChecker, HardwareChecker):
            self._checkers.append(checker_class())

    def check(self, chassis):
        """
        Load new configuration if any and perform the system health check for all existing checkers.
        :param chassis: A chassis object.
        :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
        contains the status for all objects that was checked.
        """
        from .health_checker import HealthChecker

        # Start every round from a clean summary; failing objects flip it to "Not OK"
        HealthChecker.summary = HealthChecker.STATUS_OK
        results = {}
        self.config.load_config()

        # check state first to avoid user change boot timeout in configuration file
        # after finishing system boot
        if self._state == self.STATE_BOOTING and self._is_system_booting():
            self._set_system_led(chassis, self.config, 'booting')
            return self._state, results

        for builtin_checker in self._checkers:
            self._do_check(builtin_checker, results)

        if self.config.user_defined_checkers:
            from .user_defined_checker import UserDefinedChecker
            for command in self.config.user_defined_checkers:
                self._do_check(UserDefinedChecker(command), results)

        if HealthChecker.summary == HealthChecker.STATUS_OK:
            led_status = 'normal'
        else:
            led_status = 'fault'
        self._set_system_led(chassis, self.config, led_status)

        return self._state, results

    def _do_check(self, checker, stats):
        """
        Do check for a particular checker and collect the check statistic.
        :param checker: A checker object.
        :param stats: Check statistic. Updated in place, keyed by checker category.
        :return:
        """
        try:
            checker.check(self.config)
            category = checker.get_category()
            info = checker.get_info()
            if category in stats:
                stats[category].update(info)
            else:
                # First result for this category: the checker's own info dictionary is stored as is
                stats[category] = info
        except Exception as e:
            # A failing checker must not break the whole round; report it under the 'Internal' category.
            # NOTE(review): HealthChecker.summary is not changed on this path, so a crashing checker does not
            # by itself switch the LED to 'fault' - confirm whether that is intended.
            from .health_checker import HealthChecker
            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
            failure = {str(checker): {
                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
            }}
            stats.setdefault('Internal', {}).update(failure)

    def _is_system_booting(self):
        """
        Compare the system uptime with the boot up timeout. Once the timeout has passed, the manager state is
        switched to STATE_RUNNING.
        :return: True if the uptime is still below the boot up timeout, False otherwise
        """
        from .utils import get_uptime
        uptime = get_uptime()
        if not self.boot_timeout:
            self.boot_timeout = self.config.get_bootup_timeout()

        still_booting = uptime < self.boot_timeout
        if still_booting:
            return still_booting

        self._state = self.STATE_RUNNING
        return still_booting

    def _set_system_led(self, chassis, config, status):
        """
        Set the chassis status LED to the color configured for the given status. Failures are printed, not raised.
        :param chassis: A chassis object providing set_status_led.
        :param config: Configuration object providing get_led_color.
        :param status: Status name used to look up the LED color.
        :return:
        """
        try:
            color = config.get_led_color(status)
            chassis.set_status_led(color)
        except NotImplementedError:
            print('chassis.set_status_led is not implemented')
        except Exception as e:
            print('Failed to set system led due to - {}'.format(repr(e)))
|
72
src/system-health/health_checker/service_checker.py
Normal file
72
src/system-health/health_checker/service_checker.py
Normal file
@ -0,0 +1,72 @@
|
||||
from .health_checker import HealthChecker
|
||||
from . import utils
|
||||
|
||||
|
||||
class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

    # Command to get summary of critical system service.
    CHECK_CMD = 'monit summary -B'

    # Fewest output lines of CHECK_CMD accepted as valid; the second line is read as the column header
    # and the entries start at the third line.
    MIN_CHECK_CMD_LINES = 3

    # Expect status for different system service category.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible',
        'Program': 'Status ok'
    }

    def __init__(self):
        HealthChecker.__init__(self)

    def reset(self):
        self._info = {}

    def get_category(self):
        return 'Services'

    def check(self, config):
        """
        Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
        process and file system.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()

        monit_state = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip()
        if monit_state != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        summary = utils.run_command(ServiceChecker.CHECK_CMD)
        lines = summary.splitlines()
        if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        # Column offsets are taken from the header line; every entry is sliced at the same offsets
        header = lines[1]
        status_begin = header.find('Status')
        type_begin = header.find('Type')
        if status_begin < 0 or type_begin < 0:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        for entry in lines[2:]:
            name = entry[:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
                continue

            service_type = entry[type_begin:].strip()
            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
                # Unknown service type, nothing to compare against
                continue

            actual_status = entry[status_begin:type_begin].strip()
            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
            if actual_status == expect_status:
                self.set_object_ok(service_type, name)
            else:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
|
@ -0,0 +1,11 @@
|
||||
{
|
||||
"services_to_ignore": [],
|
||||
"devices_to_ignore": [],
|
||||
"user_defined_checkers": [],
|
||||
"polling_interval": 60,
|
||||
"led_color": {
|
||||
"fault": "amber",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
}
|
||||
}
|
88
src/system-health/health_checker/user_defined_checker.py
Normal file
88
src/system-health/health_checker/user_defined_checker.py
Normal file
@ -0,0 +1,88 @@
|
||||
from .health_checker import HealthChecker
|
||||
from . import utils
|
||||
|
||||
|
||||
class UserDefinedChecker(HealthChecker):
    """
    Run a user supplied script or program to perform a customized check for a particular system. To enable a
    user defined checker:
        1. Add an element to "user_defined_checkers" in the configuration file. The element must be a command
           string that can be executed by the shell. For example: "python my_checker.py".
        2. The command output must match the following pattern:
            ${UserDefineCategory}
            ${Object1}:${ObjectStatusMessage1}
            ${Object2}:${ObjectStatusMessage2}

    An example of the command output:
        MyCategory
        Device1:OK
        Device2:OK
        Device3:Out of power
    """

    def __init__(self, cmd):
        """
        Constructor.

        :param cmd: Command string of the user defined checker.
        """
        HealthChecker.__init__(self)
        self._cmd = cmd
        self._category = None

    def reset(self):
        """Restore the default category name and drop all previously collected object status."""
        self._category = 'UserDefine'
        self._info = {}

    def get_category(self):
        """Return the category name: the default one, or the first output line of the last check."""
        return self._category

    def check(self, config):
        """
        Execute the user defined command and parse the output.

        The first non-blank output line becomes the category. Every following line of the form
        "<object>:<message>" is recorded as OK when the message is exactly 'OK', otherwise as not OK with that
        message. Lines without a colon are skipped.

        :param config: Health checker configuration.
        :return:
        """
        self.reset()

        raw_output = utils.run_command(self._cmd)
        # Missing output (None / empty) and whitespace-only output are reported the same way.
        stripped = raw_output.strip() if raw_output else raw_output
        if not stripped:
            self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd))
            return

        entries = [text.strip() for text in stripped.splitlines()]
        entries = [entry for entry in entries if entry]
        if not entries:
            self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd))
            return

        self._category = entries[0]
        for entry in entries[1:]:
            # Split at the first colon only; the message itself may contain further colons.
            name_part, separator, msg_part = entry.partition(':')
            if not separator:
                continue
            obj_name = name_part.strip()
            msg = msg_part.strip()
            if msg == 'OK':
                self.set_object_ok('UserDefine', obj_name)
            else:
                self.set_object_not_ok('UserDefine', obj_name, msg)

    def __str__(self):
        return 'UserDefinedChecker - {}'.format(self._cmd)
|
25
src/system-health/health_checker/utils.py
Normal file
25
src/system-health/health_checker/utils.py
Normal file
@ -0,0 +1,25 @@
|
||||
import subprocess
|
||||
|
||||
|
||||
def run_command(command):
    """
    Utility function to run a shell command and return its standard output.

    :param command: Shell command string.
    :return: Standard output of the shell command as text, or None if the command could not be run or its
             output could not be read.
    """
    try:
        # universal_newlines=True makes communicate() hand back text directly. The previous
        # communicate()[0].encode('utf-8') raised on Python 3 (bytes has no encode) and on non-ASCII output
        # under Python 2, and the except clause below turned that into a silent None.
        # NOTE(review): shell=True is kept because callers pass complete shell command strings; the command
        # must only ever come from trusted configuration.
        process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
        return process.communicate()[0]
    except Exception:
        # Callers treat None as "no output available", so failures are deliberately not propagated.
        return None
|
||||
|
||||
|
||||
def get_uptime():
    """
    Read how long the system has been up.

    :return: The first field of the first line of /proc/uptime, converted to float (seconds).
    """
    with open('/proc/uptime', 'r') as uptime_file:
        first_field = uptime_file.readline().split()[0]

    return float(first_field)
|
2
src/system-health/pytest.ini
Normal file
2
src/system-health/pytest.ini
Normal file
@ -0,0 +1,2 @@
|
||||
[pytest]
|
||||
addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml
|
108
src/system-health/scripts/healthd
Normal file
108
src/system-health/scripts/healthd
Normal file
@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
"""
|
||||
healthd
|
||||
System health monitor daemon for SONiC
|
||||
"""
|
||||
|
||||
import signal
|
||||
import threading
|
||||
|
||||
from sonic_py_common.daemon_base import DaemonBase
|
||||
from swsssdk import SonicV2Connector
|
||||
|
||||
from health_checker.manager import HealthCheckerManager
|
||||
|
||||
# Identifier string that HealthDaemon.__init__ hands to DaemonBase.
SYSLOG_IDENTIFIER = 'healthd'
|
||||
|
||||
|
||||
class HealthDaemon(DaemonBase):
    """
    A daemon that runs as a service to perform the system health check with a configurable interval and stores
    the check result to redis (STATE_DB, table $SYSTEM_HEALTH_TABLE_NAME).
    """
    SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'

    def __init__(self):
        """
        Constructor of HealthDaemon. Connects to STATE_DB and creates the event used to stop the main loop.
        """
        DaemonBase.__init__(self, SYSLOG_IDENTIFIER)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)
        self.stop_event = threading.Event()

    def deinit(self):
        """
        Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.

        :return:
        """
        self._clear_system_health_table()

    def _clear_system_health_table(self):
        """Delete everything stored under $SYSTEM_HEALTH_TABLE_NAME in STATE_DB."""
        self._db.delete_all_by_pattern(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME)

    # Signal handler
    def signal_handler(self, sig, frame):
        """
        Signal handler. SIGHUP is logged and ignored; SIGINT and SIGTERM request the main loop to stop.

        :param sig: Signal number
        :param frame: not used
        :return:
        """
        if sig == signal.SIGHUP:
            self.log_notice("Caught SIGHUP - ignoring...")
        elif sig == signal.SIGINT:
            self.log_notice("Caught SIGINT - exiting...")
            self.stop_event.set()
        elif sig == signal.SIGTERM:
            self.log_notice("Caught SIGTERM - exiting...")
            self.stop_event.set()
        else:
            # sig is an integer, so it must be formatted; concatenating it to a str raised TypeError.
            self.log_warning("Caught unhandled signal '{}'".format(sig))

    def run(self):
        """
        Check system health in a loop until a stop is requested, then clear the health table.

        Returns immediately, without looping, when the system health configuration file does not exist.

        :return:
        """
        self.log_notice("Starting up...")

        # Imported here rather than at module level, so the platform package is only needed once run() starts.
        import sonic_platform.platform
        chassis = sonic_platform.platform.Platform().get_chassis()
        manager = HealthCheckerManager()
        if not manager.config.config_file_exists():
            self.log_warning("System health configuration file not found, exit...")
            return

        while True:
            state, stat = manager.check(chassis)
            if state == HealthCheckerManager.STATE_RUNNING:
                self._process_stat(chassis, manager.config, stat)

            # wait() doubles as the polling sleep: it returns True as soon as a stop was requested.
            if self.stop_event.wait(manager.config.interval):
                break

        self.deinit()

    def _process_stat(self, chassis, config, stat):
        """
        Replace the content of $SYSTEM_HEALTH_TABLE_NAME with the latest check result.

        Only objects whose status is not OK get their own field (object name -> message); a 'summary' field
        holding HealthChecker.summary is always written.

        :param chassis: not used
        :param config: not used
        :param stat: Dict of category -> {object name -> object data} as returned by the manager.
        :return:
        """
        from health_checker.health_checker import HealthChecker

        self._clear_system_health_table()
        for category, info in stat.items():
            for obj_name, obj_data in info.items():
                if obj_data[HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK:
                    self._db.set(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME, obj_name,
                                 obj_data[HealthChecker.INFO_FIELD_OBJECT_MSG])

        self._db.set(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME, 'summary', HealthChecker.summary)
|
||||
|
||||
|
||||
#
|
||||
# Main =========================================================================
|
||||
#
|
||||
def main():
    """Entry point: create the health daemon and run it until its loop ends."""
    HealthDaemon().run()
|
||||
|
||||
|
||||
# Start the daemon only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|
2
src/system-health/setup.cfg
Normal file
2
src/system-health/setup.cfg
Normal file
@ -0,0 +1,2 @@
|
||||
[aliases]
|
||||
test=pytest
|
49
src/system-health/setup.py
Normal file
49
src/system-health/setup.py
Normal file
@ -0,0 +1,49 @@
|
||||
from setuptools import setup
|
||||
|
||||
# Runtime requirements, passed to setup() as install_requires.
dependencies = [
    'natsort',
    'sonic_py_common',
    'swsssdk>=2.0.1',
]

setup(
    name='system-health',
    version='1.0',
    description='SONiC system health package',
    license='Apache 2.0',
    author='SONiC Team',
    author_email='linuxnetdev@microsoft.com',
    url='https://github.com/Azure/sonic-buildimage',
    maintainer='Junchao Chen',
    maintainer_email='junchaow@mellanox.com',
    install_requires=dependencies,
    # NOTE(review): 'tests' is listed as a package here, so it is packaged together with health_checker -
    # confirm this is intended.
    packages=[
        'health_checker',
        'tests'
    ],
    scripts=[
        'scripts/healthd',
    ],
    setup_requires= [
        'pytest-runner'
    ],
    tests_require = [
        'pytest',
        'mock>=2.0.0'
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: No Input/Output (Daemon)',
        'Intended Audience :: Developers',
        'Intended Audience :: Information Technology',
        'Intended Audience :: System Administrators',
        'License :: OSI Approved :: Apache Software License',
        'Natural Language :: English',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python :: 2.7',
        'Topic :: System :: Hardware',
    ],
    keywords='SONiC sonic HEALTH health',
    # NOTE(review): no get_test_suite function is visible in this setup script - verify that this target exists.
    test_suite='setup.get_test_suite'
)
|
||||
|
0
src/system-health/tests/__init__.py
Normal file
0
src/system-health/tests/__init__.py
Normal file
25
src/system-health/tests/mock_connector.py
Normal file
25
src/system-health/tests/mock_connector.py
Normal file
@ -0,0 +1,25 @@
|
||||
class MockConnector(object):
    """
    In-memory stand-in for the database connector used by the unit tests.

    All instances read from the class-level ``data`` dict (key -> {field -> value}); tests fill it directly.
    The db_id arguments are accepted but ignored.
    """
    STATE_DB = None
    data = {}

    def __init__(self, host):
        """Accept and ignore the host argument; no connection is made."""

    def connect(self, db_id):
        """Do nothing; present so callers can invoke connect() on the mock."""

    def get(self, db_id, key, field):
        """Return one field of the entry stored under key."""
        entry = MockConnector.data[key]
        return entry[field]

    def keys(self, db_id, pattern):
        """Return the stored keys that contain the part of pattern preceding its first '*'."""
        wanted = pattern.split('*')[0]
        return [stored_key for stored_key in MockConnector.data if wanted in stored_key]

    def get_all(self, db_id, key):
        """Return the whole field dict stored under key."""
        return MockConnector.data[key]
|
||||
|
219
src/system-health/tests/test_system_health.py
Normal file
219
src/system-health/tests/test_system_health.py
Normal file
@ -0,0 +1,219 @@
|
||||
"""
|
||||
Unit test cases for system health checker. The current test case contains:
|
||||
1. test_user_defined_checker mocks the output of a user defined checker and verify class UserDefinedChecker
|
||||
2. test_service_checker mocks the output of monit service and verify class ServiceChecker
|
||||
3. test_hardware_checker mocks the hardware status data in db and verify class HardwareChecker
|
||||
The following classes are not covered by unit tests. They will be covered by the sonic-mgmt regression tests.
|
||||
1. HealthDaemon
|
||||
2. HealthCheckerManager
|
||||
3. Config
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import swsssdk
|
||||
|
||||
from mock import Mock, MagicMock, patch
|
||||
from sonic_py_common import device_info
|
||||
|
||||
from .mock_connector import MockConnector
|
||||
|
||||
# Swap the real connector class for the in-memory mock before the health_checker modules are imported.
swsssdk.SonicV2Connector = MockConnector

# Put the directory that contains this tests folder at the front of sys.path.
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
|
||||
from health_checker import utils
|
||||
from health_checker.config import Config
|
||||
from health_checker.hardware_checker import HardwareChecker
|
||||
from health_checker.health_checker import HealthChecker
|
||||
from health_checker.manager import HealthCheckerManager
|
||||
from health_checker.service_checker import ServiceChecker
|
||||
from health_checker.user_defined_checker import UserDefinedChecker
|
||||
|
||||
# Make every device_info.get_platform() call return the fixed platform name 'unittest'.
device_info.get_platform = MagicMock(return_value='unittest')
|
||||
|
||||
|
||||
def test_user_defined_checker():
    """Feed UserDefinedChecker empty, blank-only and well-formed command output and verify the recorded status."""
    status_field = HealthChecker.INFO_FIELD_OBJECT_STATUS

    # Empty output: the checker itself is reported as not OK.
    utils.run_command = MagicMock(return_value='')
    checker = UserDefinedChecker('')
    checker.check(None)
    assert checker._info[str(checker)][status_field] == HealthChecker.STATUS_NOT_OK

    # reset() must drop everything collected so far.
    checker.reset()
    assert len(checker._info) == 0

    # Output made of line breaks only is treated like missing output.
    utils.run_command = MagicMock(return_value='\n\n\n')
    checker.check(None)
    assert checker._info[str(checker)][status_field] == HealthChecker.STATUS_NOT_OK

    # Well-formed output: one healthy and one broken object.
    valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n'
    utils.run_command = MagicMock(return_value=valid_output)
    checker.check(None)
    for device in ('Device1', 'Device2'):
        assert device in checker._info
    assert checker._info['Device1'][status_field] == HealthChecker.STATUS_OK
    assert checker._info['Device2'][status_field] == HealthChecker.STATUS_NOT_OK
|
||||
|
||||
|
||||
def test_service_checker():
    """Feed ServiceChecker a mocked monit summary and verify the status recorded for each entry."""
    # ServiceChecker.check cuts every row at the offsets of 'Status' and 'Type' in the header line, so the
    # fixture rows must be padded to the same column widths as the header. Building them with fixed-width
    # formatting guarantees that alignment.
    row_format = '{:<33}{:<28}{}\n'
    rows = [
        ('Service Name', 'Status', 'Type'),
        ('sonic', 'Running', 'System'),
        ('sonic1', 'Not running', 'System'),
        ('telemetry', 'Does not exist', 'Process'),
        ('orchagent', 'Running', 'Process'),
        ('root-overlay', 'Accessible', 'Filesystem'),
        ('var-log', 'Is not accessible', 'Filesystem'),
    ]
    monit_summary = 'Monit 5.20.0 uptime: 3h 54m\n' + ''.join(row_format.format(*row) for row in rows)

    def mock_run_command(cmd):
        # The monit service probe must report 'active' for the summary to be parsed at all.
        if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD:
            return 'active'
        return monit_summary

    utils.run_command = mock_run_command

    checker = ServiceChecker()
    config = Config()
    checker.check(config)

    expected = [
        ('sonic', HealthChecker.STATUS_OK),
        ('sonic1', HealthChecker.STATUS_NOT_OK),
        ('orchagent', HealthChecker.STATUS_OK),
        ('telemetry', HealthChecker.STATUS_NOT_OK),
        ('root-overlay', HealthChecker.STATUS_OK),
        ('var-log', HealthChecker.STATUS_NOT_OK),
    ]
    for name, status in expected:
        assert name in checker._info
        assert checker._info[name][HealthChecker.INFO_FIELD_OBJECT_STATUS] == status
|
||||
|
||||
|
||||
def test_hardware_checker():
    """Load mocked ASIC, fan and PSU entries into the mock database and verify HardwareChecker's verdicts."""
    def fan_entry(presence, status, speed):
        # Every fan shares the same target speed and tolerance.
        return {
            'presence': presence,
            'status': status,
            'speed': speed,
            'speed_target': '60',
            'speed_tolerance': '20'
        }

    def psu_entry(presence, status, temp, voltage_min_threshold):
        # Every PSU shares the same temperature threshold, voltage and maximum voltage threshold.
        return {
            'presence': presence,
            'status': status,
            'temp': temp,
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': voltage_min_threshold,
            'voltage_max_threshold': '15',
        }

    asic_data = {
        'TEMPERATURE_INFO|ASIC': {
            'temperature': '20',
            'high_threshold': '21'
        }
    }
    fan_data = {
        'FAN_INFO|fan1': fan_entry('True', 'True', '60'),
        'FAN_INFO|fan2': fan_entry('False', 'True', '60'),
        'FAN_INFO|fan3': fan_entry('True', 'False', '60'),
        'FAN_INFO|fan4': fan_entry('True', 'True', '20'),
    }
    psu_data = {
        'PSU_INFO|PSU 1': psu_entry('True', 'True', '55', '8'),
        'PSU_INFO|PSU 2': psu_entry('False', 'True', '55', '8'),
        'PSU_INFO|PSU 3': psu_entry('True', 'False', '55', '8'),
        'PSU_INFO|PSU 4': psu_entry('True', 'True', '101', '8'),
        'PSU_INFO|PSU 5': psu_entry('True', 'True', '55', '12'),
    }
    for table in (asic_data, fan_data, psu_data):
        MockConnector.data.update(table)

    checker = HardwareChecker()
    config = Config()
    checker.check(config)

    ok = HealthChecker.STATUS_OK
    not_ok = HealthChecker.STATUS_NOT_OK
    expected = [
        ('ASIC', ok),
        ('fan1', ok),
        ('fan2', not_ok),
        ('fan3', not_ok),
        ('fan4', not_ok),
        ('PSU 1', ok),
        ('PSU 2', not_ok),
        ('PSU 3', not_ok),
        ('PSU 4', not_ok),
        ('PSU 5', not_ok),
    ]
    for name, status in expected:
        assert name in checker._info
        assert checker._info[name][HealthChecker.INFO_FIELD_OBJECT_STATUS] == status
|
Loading…
Reference in New Issue
Block a user