[system-health] Add support for monitoring system health (#4835)

* system health first commit

* system health daemon first commit

* Finish healthd

* Changes due to lower layer logic change

* Get ASIC temperature from TEMPERATURE_INFO table

* Add system health make rule and service files

* fix bugs found during manual test

* Change make file to install system-health library to host

* Set system LED to blink on bootup time

* Caught exceptions in system health checker to make it more robust

* fix issue that fan/psu presence will always be true

* fix issue for external checker

* move system-health service to right after rc-local service

* Set system-health service start after database service

* Get system up time via /proc/uptime

* Provide more information in stat for CLI to use

* fix typo

* Set default category to External for external checker

* If external checker reported OK, save it to stat too

* Trim string for external checker output

* fix issue: PSU voltage check always return OK

* Add unit test cases for system health library

* Fix LGTM warnings

* fix demo comments: 1. get boot up timeout from monit configuration file; 2. set system led in library instead of daemon

* Remove boot_timeout configuration because it will get from monit config file

* Fix argument miss

* fix unit test failure

* fix issue: summary status is not correct

* Fix format issues found in code review

* rename th to threshold to make it clearer

* Fix review comment: 1. add a .dep file for system health; 2. replace the deprecated daemon_base with sonic-py-common

* Fix unit test failure

* Fix LGTM alert

* Fix LGTM alert

* Fix review comments

* Fix review comment

* 1. Add relevant comments for system health; 2. rename external_checker to user_define_checker

* Ignore check for unknown service type

* Fix unit test issue

* Rename user define checker to user defined checker

* Rename user_define_checkers to user_defined_checkers for configuration file

* Rename file user_define_checker.py -> user_defined_checker.py

* Fix typo

* Adjust import order for config.py

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>

* Adjust import order for src/system-health/health_checker/hardware_checker.py

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>

* Adjust import order for src/system-health/scripts/healthd

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>

* Adjust import orders in src/system-health/tests/test_system_health.py

* Fix typo

* Add new line after import

* If system health configuration file not exist, healthd should exit

* Fix indent and enable pytest coverage

* Fix typo

* Fix typo

* Remove global logger and use log functions inherited from super class

* Change info level logger to notice level

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>
This commit is contained in:
Junchao-Mellanox 2020-10-12 16:12:49 +08:00 committed by GitHub
parent 8e0e316cf8
commit 1c97a03b81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 1242 additions and 13 deletions

View File

@ -1,11 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": ["psu.voltage", "psu.temperature"],
"external_checkers": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}

View File

@ -1,11 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": ["psu.voltage"],
"external_checkers": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}

View File

@ -1,11 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": ["psu","asic","fan"],
"external_checkers": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}

View File

@ -172,6 +172,12 @@ sudo cp {{platform_common_py2_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $PLATFORM_COMMON_PY2_WHEEL_NAME
sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2_WHEEL_NAME
# Install system-health Python 2 package
SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}})
sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $SYSTEM_HEALTH_PY2_WHEEL_NAME
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
# Install sonic-platform-common Python 3 package
PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}})
sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME
@ -283,6 +289,10 @@ sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d
sudo cp $IMAGE_CONFIGS/syslog/override.conf $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d/override.conf
sudo cp $IMAGE_CONFIGS/syslog/host_umount.sh $FILESYSTEM_ROOT/usr/bin/
# Copy system-health files
sudo LANG=C cp $IMAGE_CONFIGS/system-health/system-health.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
echo "system-health.service" | sudo tee -a $GENERATED_SERVICE_FILE
# Copy logrotate.d configuration files
sudo cp -f $IMAGE_CONFIGS/logrotate/logrotate.d/* $FILESYSTEM_ROOT/etc/logrotate.d/

View File

@ -0,0 +1,11 @@
# systemd unit that runs the SONiC system health monitor daemon (healthd).
[Unit]
Description=SONiC system health monitor
# Both services are required, and this unit is ordered after them.
Requires=database.service updategraph.service
After=database.service updategraph.service
[Service]
ExecStart=/usr/local/bin/healthd
# Restart healthd whenever it exits.
Restart=always
[Install]
WantedBy=multi-user.target

8
rules/system-health.dep Normal file
View File

@ -0,0 +1,8 @@
# Dependency description for the system-health wheel.
# SPATH is the source directory configured for $(SYSTEM_HEALTH).
SPATH := $($(SYSTEM_HEALTH)_SRC_PATH)
# Files this target depends on: the common file lists, the two rule files for
# this target, and every git-tracked file under the source directory.
DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/system-health.mk rules/system-health.dep
DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST)
DEP_FILES += $(shell git ls-files $(SPATH))
# NOTE(review): presumably consumed by the build cache framework -- confirm.
$(SYSTEM_HEALTH)_CACHE_MODE := GIT_CONTENT_SHA
$(SYSTEM_HEALTH)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST)
$(SYSTEM_HEALTH)_DEP_FILES := $(DEP_FILES)

9
rules/system-health.mk Normal file
View File

@ -0,0 +1,9 @@
# system health python2 wheel
SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl
# Source directory and Python version used for this wheel
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
$(SYSTEM_HEALTH)_PYTHON_VERSION = 2
# Wheels this wheel depends on
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE)
# Register the wheel in the list of Python wheels
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
# Export the full wheel path under the name system_health_py2_wheel_path
export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"

View File

@ -819,7 +819,8 @@ $(addprefix $(TARGET_PATH)/, $(SONIC_INSTALLERS)) : $(TARGET_PATH)/% : \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(REDIS_DUMP_LOAD_PY2)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MODELS_PY3)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY))
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))
$(HEADER)
# Pass initramfs and linux kernel explicitly. They are used for all platforms
export debs_path="$(IMAGE_DISTRO_DEBS_PATH)"

8
src/system-health/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
*/deb_dist/
*/dist/
*/build/
*/*.tar.gz
*/*.egg-info
*/.cache/
*.pyc
*/__pycache__/

View File

@ -0,0 +1,2 @@
from . import hardware_checker
from . import service_checker

View File

@ -0,0 +1,144 @@
import json
import os
from sonic_py_common import device_info
class Config(object):
    """
    Manage configuration of system health.

    The configuration is read from a per-platform JSON file. Every entry falls back to a default value when the
    file is missing or cannot be parsed.
    """

    # Default system health check interval
    DEFAULT_INTERVAL = 60

    # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
    DEFAULT_BOOTUP_TIMEOUT = 300

    # Default LED configuration. Different platform has different LED capability. This configuration allow vendor to
    # override the default behavior.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }

    # System health configuration file name
    CONFIG_FILE = 'system_health_monitoring_config.json'

    # Monit service configuration file path
    MONIT_CONFIG_FILE = '/etc/monit/monitrc'

    # Monit service start delay configuration entry
    MONIT_START_DELAY_CONFIG = 'with start delay'

    def __init__(self):
        """
        Constructor. Initialize all configuration entry to default value in case there is no configuration file.
        """
        self.platform_name = device_info.get_platform()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def config_file_exists(self):
        """
        Check whether the platform configuration file is present on disk.
        :return: True if the configuration file exists, False otherwise
        """
        return os.path.exists(self._config_file)

    def load_config(self):
        """
        Load the configuration file from disk.
        1. If there is no configuration file, current config entries will reset to default value
        2. Only read the configuration file if its modification time changed since the last successful load
        3. If there is any format issues in configuration file, current config entries will reset to default value
        :return:
        """
        if not self.config_file_exists():
            if self._last_mtime is not None:
                self._reset()
            return

        # Compare the modification time only. A full os.stat() result also contains the access time, which may
        # change whenever the file is read and would trigger needless reloads.
        mtime = os.path.getmtime(self._config_file)
        if mtime != self._last_mtime:
            try:
                self._last_mtime = mtime
                with open(self._config_file, 'r') as f:
                    self.config_data = json.load(f)

                self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
                self.ignore_services = self._get_list_data('services_to_ignore')
                self.ignore_devices = self._get_list_data('devices_to_ignore')
                self.user_defined_checkers = self._get_list_data('user_defined_checkers')
            except Exception:
                # Unreadable or malformed file: fall back to defaults. _reset() clears _last_mtime, so the file
                # is parsed again on the next call.
                self._reset()

    def _reset(self):
        """
        Reset current configuration entry to default value
        :return:
        """
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def get_led_color(self, status):
        """
        Get desired LED color according to the input status
        :param status: System health status, one of the keys of DEFAULT_LED_CONFIG
        :return: String LED color, taken from the "led_color" entry of the configuration file when it defines
                 the status, otherwise from DEFAULT_LED_CONFIG
        """
        if self.config_data and 'led_color' in self.config_data:
            if status in self.config_data['led_color']:
                return self.config_data['led_color'][status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """
        Get boot up timeout from monit configuration file.
        1. If monit configuration file does not exist, return default value
        2. If there is any exception while parsing monit config, return default value
        3. If no start delay entry is found, return default value
        :return: Integer timeout value
        """
        if not os.path.exists(Config.MONIT_CONFIG_FILE):
            return self.DEFAULT_BOOTUP_TIMEOUT

        try:
            with open(Config.MONIT_CONFIG_FILE) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue

                    pos = line.find('#')
                    if pos == 0:
                        # Whole line is a comment
                        continue

                    # Strip a trailing comment. Only slice when a '#' was found: slicing with -1 would drop
                    # the last character of the line and corrupt the parsed value.
                    if pos != -1:
                        line = line[:pos]

                    pos = line.find(Config.MONIT_START_DELAY_CONFIG)
                    if pos != -1:
                        return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
        except Exception:
            return self.DEFAULT_BOOTUP_TIMEOUT

        # No start delay entry in the file
        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_list_data(self, key):
        """
        Get list type configuration data by key and remove duplicate element.
        :param key: Key of the configuration entry
        :return: A set of configuration data if key exists and its value is a list, otherwise None
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None

View File

@ -0,0 +1,248 @@
from natsort import natsorted
from swsssdk import SonicV2Connector
from .health_checker import HealthChecker
class HardwareChecker(HealthChecker):
    """
    Check system hardware status. For now, it checks ASIC, PSU and fan status.

    All data is read from STATE_DB through a SonicV2Connector. Each checked object is recorded via
    set_object_ok / set_object_not_ok, inherited from HealthChecker.
    """
    # STATE_DB key holding the ASIC temperature entry
    ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
    # STATE_DB key prefixes used to list fan and PSU entries
    FAN_TABLE_NAME = 'FAN_INFO'
    PSU_TABLE_NAME = 'PSU_INFO'

    def __init__(self):
        """
        Constructor. Connect to STATE_DB on the local host.
        """
        HealthChecker.__init__(self)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)

    def get_category(self):
        """
        Get category of the checker.
        :return: The string 'Hardware'
        """
        return 'Hardware'

    def check(self, config):
        """
        Clear the previous result, then check ASIC, fan and PSU status in that order.
        :param config: Health checker configuration
        :return:
        """
        self.reset()
        self._check_asic_status(config)
        self._check_fan_status(config)
        self._check_psu_status(config)

    def _check_asic_status(self, config):
        """
        Check if ASIC temperature is in valid range.
        The check is skipped entirely when 'asic' is listed in config.ignore_devices.
        :param config: Health checker configuration
        :return:
        """
        if config.ignore_devices and 'asic' in config.ignore_devices:
            return

        temperature = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature')
        temperature_threshold = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold')
        # Missing (falsy) values are reported as failures before any conversion is attempted
        if not temperature:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
        elif not temperature_threshold:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
        else:
            try:
                temperature = float(temperature)
                temperature_threshold = float(temperature_threshold)
                if temperature > temperature_threshold:
                    self.set_object_not_ok('ASIC', 'ASIC',
                                           'ASIC temperature is too hot, temperature={}, threshold={}'.format(
                                               temperature,
                                               temperature_threshold))
                else:
                    self.set_object_ok('ASIC', 'ASIC')
            except ValueError as e:
                # float() rejected one of the values read from the database
                self.set_object_not_ok('ASIC', 'ASIC',
                                       'Invalid ASIC temperature data, temperature={}, threshold={}'.format(temperature,
                                                                                                            temperature_threshold))

    def _check_fan_status(self, config):
        """
        Check fan status including:
        1. Check all fans are present
        2. Check all fans are in good state
        3. Check fan speed is in valid range
        The first failing check for a fan is recorded and the remaining checks for that fan are skipped.
        :param config: Health checker configuration
        :return:
        """
        if config.ignore_devices and 'fan' in config.ignore_devices:
            return

        keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
        if not keys:
            self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
            return

        for key in natsorted(keys):
            key_list = key.split('|')
            if len(key_list) != 2:  # error data in DB, log it and ignore
                self.set_object_not_ok('Fan', key, 'Invalid key for FAN_INFO: {}'.format(key))
                continue

            # The part after '|' is used as the fan name
            name = key_list[1]
            if config.ignore_devices and name in config.ignore_devices:
                continue
            data_dict = self._db.get_all(self._db.STATE_DB, key)
            # A missing 'presence' or 'status' field is treated as 'false'
            presence = data_dict.get('presence', 'false')
            if presence.lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
                continue

            status = data_dict.get('status', 'false')
            if status.lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
                continue

            # Speed check can be disabled with 'fan.speed' or '<fan name>.speed' in the ignore set
            if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
                speed = data_dict.get('speed', None)
                speed_target = data_dict.get('speed_target', None)
                speed_tolerance = data_dict.get('speed_tolerance', None)
                if not speed:
                    self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
                    continue
                elif not speed_target:
                    self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
                    continue
                elif not speed_tolerance:
                    self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
                    continue
                else:
                    try:
                        speed = float(speed)
                        speed_target = float(speed_target)
                        speed_tolerance = float(speed_tolerance)
                        # Tolerance is applied as a percentage of the target speed
                        speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
                        speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
                        if speed < speed_min_th or speed > speed_max_th:
                            self.set_object_not_ok('Fan', name,
                                                   '{} speed is out of range, speed={}, range=[{},{}]'.format(name,
                                                                                                              speed,
                                                                                                              speed_min_th,
                                                                                                              speed_max_th))
                            continue
                    except ValueError:
                        self.set_object_not_ok('Fan', name,
                                               'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
                                                   name,
                                                   speed,
                                                   speed_target,
                                                   speed_tolerance))
                        continue

            # Reached only when no check above failed
            self.set_object_ok('Fan', name)

    def _check_psu_status(self, config):
        """
        Check PSU status including:
        1. Check all PSUs are present
        2. Check all PSUs are power on
        3. Check PSU temperature is in valid range
        4. Check PSU voltage is in valid range
        The first failing check for a PSU is recorded and the remaining checks for that PSU are skipped.
        :param config: Health checker configuration
        :return:
        """
        if config.ignore_devices and 'psu' in config.ignore_devices:
            return

        keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
        if not keys:
            self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
            return

        for key in natsorted(keys):
            key_list = key.split('|')
            if len(key_list) != 2:  # error data in DB, log it and ignore
                self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key))
                continue

            # The part after '|' is used as the PSU name
            name = key_list[1]
            if config.ignore_devices and name in config.ignore_devices:
                continue

            data_dict = self._db.get_all(self._db.STATE_DB, key)
            # A missing 'presence' or 'status' field is treated as 'false'
            presence = data_dict.get('presence', 'false')
            if presence.lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
                continue

            status = data_dict.get('status', 'false')
            if status.lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
                continue

            # Temperature check can be disabled with 'psu.temperature' or '<psu name>.temperature'
            if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'):
                temperature = data_dict.get('temp', None)
                temperature_threshold = data_dict.get('temp_threshold', None)
                if temperature is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
                    continue
                elif temperature_threshold is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get temperature threshold data for {}'.format(name))
                    continue
                else:
                    try:
                        temperature = float(temperature)
                        temperature_threshold = float(temperature_threshold)
                        if temperature > temperature_threshold:
                            self.set_object_not_ok('PSU', name,
                                                   '{} temperature is too hot, temperature={}, threshold={}'.format(
                                                       name, temperature,
                                                       temperature_threshold))
                            continue
                    except ValueError:
                        self.set_object_not_ok('PSU', name,
                                               'Invalid temperature data for {}, temperature={}, threshold={}'.format(
                                                   name, temperature,
                                                   temperature_threshold))
                        continue

            # Voltage check can be disabled with 'psu.voltage' or '<psu name>.voltage'
            if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'):
                voltage = data_dict.get('voltage', None)
                voltage_min_th = data_dict.get('voltage_min_threshold', None)
                voltage_max_th = data_dict.get('voltage_max_threshold', None)
                if voltage is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
                    continue
                elif voltage_min_th is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage minimum threshold data for {}'.format(name))
                    continue
                elif voltage_max_th is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage maximum threshold data for {}'.format(name))
                    continue
                else:
                    try:
                        voltage = float(voltage)
                        voltage_min_th = float(voltage_min_th)
                        voltage_max_th = float(voltage_max_th)
                        if voltage < voltage_min_th or voltage > voltage_max_th:
                            self.set_object_not_ok('PSU', name,
                                                   '{} voltage is out of range, voltage={}, range=[{},{}]'.format(name,
                                                                                                                  voltage,
                                                                                                                  voltage_min_th,
                                                                                                                  voltage_max_th))
                            continue
                    except ValueError:
                        self.set_object_not_ok('PSU', name,
                                               'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(name,
                                                                                                               voltage,
                                                                                                               voltage_min_th,
                                                                                                               voltage_max_th))
                        continue
            # Reached only when no check above failed
            self.set_object_ok('PSU', name)

    def reset(self):
        """
        Drop all results collected by the previous check.
        :return:
        """
        self._info = {}

    @classmethod
    def _ignore_check(cls, ignore_set, category, object_name, check_point):
        """
        Tell whether a particular check point should be skipped for an object.
        :param ignore_set: Collection of ignore entries from the configuration, may be None or empty
        :param category: Device category, e.g. 'fan' or 'psu'
        :param object_name: Name of the object being checked
        :param check_point: Name of the check, e.g. 'speed', 'temperature' or 'voltage'
        :return: True if '<category>.<check_point>' or '<object_name>.<check_point>' is in ignore_set
        """
        if not ignore_set:
            return False

        if '{}.{}'.format(category, check_point) in ignore_set:
            return True
        elif '{}.{}'.format(object_name, check_point) in ignore_set:
            return True
        return False

View File

@ -0,0 +1,86 @@
class HealthChecker(object):
    """
    Common base for every health checker.

    A checker inspects one category of the system and keeps the outcome of its last run in ``self._info``, a
    dictionary keyed by object name whose values are dictionaries of object attributes.
    """
    # Attribute names stored for every checked object
    INFO_FIELD_OBJECT_TYPE = 'type'
    INFO_FIELD_OBJECT_STATUS = 'status'
    INFO_FIELD_OBJECT_MSG = 'message'

    # Possible values of the status attribute
    STATUS_OK = 'OK'
    STATUS_NOT_OK = 'Not OK'

    # Class-level overall status, shared by all checkers; set to STATUS_NOT_OK by set_object_not_ok
    summary = STATUS_OK

    def __init__(self):
        self._info = dict()

    def reset(self):
        """
        Hook invoked before each check so a checker can discard its previous state. The base version does nothing.
        :return:
        """
        return None

    def get_category(self):
        """
        Name of the category this checker reports under. The base version returns None.
        :return: String category
        """
        return None

    def get_info(self):
        """
        Result of the last check: one entry per checked object, each holding that object's attributes.
        :return: Check result.
        """
        return self._info

    def check(self, config):
        """
        Run the check. The base version does nothing.
        :param config: Health checker configuration.
        :return:
        """
        return None

    def __str__(self):
        return type(self).__name__

    def add_info(self, object_name, key, value):
        """
        Store one attribute of a checked object, creating the object's entry on first use.
        :param object_name: Object name.
        :param key: Object attribute name.
        :param value: Object attribute value.
        :return:
        """
        self._info.setdefault(object_name, {})[key] = value

    def set_object_not_ok(self, object_type, object_name, message):
        """
        Record an object as faulty and mark the shared summary as not OK.
        :param object_type: Object type.
        :param object_name: Object name.
        :param message: A message to describe what is wrong with the object.
        :return:
        """
        attributes = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, message),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK),
        )
        for field, field_value in attributes:
            self.add_info(object_name, field, field_value)
        HealthChecker.summary = HealthChecker.STATUS_NOT_OK

    def set_object_ok(self, object_type, object_name):
        """
        Record an object as healthy, with an empty message.
        :param object_type: Object type.
        :param object_name: Object name.
        :return:
        """
        attributes = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, ''),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK),
        )
        for field, field_value in attributes:
            self.add_info(object_name, field, field_value)

View File

@ -0,0 +1,101 @@
class HealthCheckerManager(object):
    """
    Manage all system health checkers and system health configuration.
    """
    # Manager states: it starts in STATE_BOOTING and switches to STATE_RUNNING once the boot timeout has elapsed
    STATE_BOOTING = 'booting'
    STATE_RUNNING = 'running'
    # Boot timeout, read from the configuration on first use
    boot_timeout = None

    def __init__(self):
        self._checkers = []
        self._state = self.STATE_BOOTING

        from .config import Config
        self.config = Config()
        self.initialize()

    def initialize(self):
        """
        Initialize the manager. Create service checker and hardware checker by default.
        :return:
        """
        from .service_checker import ServiceChecker
        from .hardware_checker import HardwareChecker
        self._checkers.append(ServiceChecker())
        self._checkers.append(HardwareChecker())

    def check(self, chassis):
        """
        Load new configuration if any and perform the system health check for all existing checkers.
        While the system is still booting, only the system LED is set and no checker is run.
        :param chassis: A chassis object.
        :return: A tuple. The first element is the state of the manager (STATE_BOOTING or STATE_RUNNING); the
                 second element is a dictionary, keyed by category, that contains the status for all objects that
                 were checked.
        """
        from .health_checker import HealthChecker
        # The summary is shared class-level state, so start every round from OK
        HealthChecker.summary = HealthChecker.STATUS_OK
        stats = {}
        self.config.load_config()
        # check state first to avoid user change boot timeout in configuration file
        # after finishing system boot
        if self._state == self.STATE_BOOTING and self._is_system_booting():
            self._set_system_led(chassis, self.config, 'booting')
            return self._state, stats

        for checker in self._checkers:
            self._do_check(checker, stats)

        if self.config.user_defined_checkers:
            from .user_defined_checker import UserDefinedChecker
            for udc in self.config.user_defined_checkers:
                checker = UserDefinedChecker(udc)
                self._do_check(checker, stats)

        led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
        self._set_system_led(chassis, self.config, led_status)

        return self._state, stats

    def _do_check(self, checker, stats):
        """
        Do check for a particular checker and collect the check statistic.
        If the checker raises, the failure is recorded under the 'Internal' category and the overall summary is
        marked as not OK.
        :param checker: A checker object.
        :param stats: Check statistic. Updated in place.
        :return:
        """
        try:
            checker.check(self.config)
            category = checker.get_category()
            info = checker.get_info()
            if category not in stats:
                stats[category] = info
            else:
                stats[category].update(info)
        except Exception as e:
            from .health_checker import HealthChecker
            # A checker that crashed could not verify anything, so the system must not be reported as healthy
            HealthChecker.summary = HealthChecker.STATUS_NOT_OK
            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
            entry = {str(checker): {
                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg,
                # Keep the same attributes as entries produced by set_object_not_ok
                HealthChecker.INFO_FIELD_OBJECT_TYPE: 'Internal'
            }}
            if 'Internal' not in stats:
                stats['Internal'] = entry
            else:
                stats['Internal'].update(entry)

    def _is_system_booting(self):
        """
        Check whether the system up time is still below the boot timeout. Switches the manager to STATE_RUNNING
        once the timeout has elapsed.
        :return: True if the system is considered to be booting.
        """
        from .utils import get_uptime
        uptime = get_uptime()
        if not self.boot_timeout:
            self.boot_timeout = self.config.get_bootup_timeout()
        booting = uptime < self.boot_timeout
        if not booting:
            self._state = self.STATE_RUNNING
        return booting

    def _set_system_led(self, chassis, config, status):
        """
        Set the system status LED to the color configured for the given status. Failures are printed and
        otherwise ignored so that an LED problem never interrupts the health check.
        :param chassis: A chassis object providing set_status_led.
        :param config: Health checker configuration.
        :param status: One of 'booting', 'normal' or 'fault'.
        :return:
        """
        try:
            chassis.set_status_led(config.get_led_color(status))
        except NotImplementedError:
            print('chassis.set_status_led is not implemented')
        except Exception as e:
            print('Failed to set system led due to - {}'.format(repr(e)))

View File

@ -0,0 +1,72 @@
from .health_checker import HealthChecker
from . import utils
class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

    # Command to get summary of critical system service.
    CHECK_CMD = 'monit summary -B'
    # Minimum number of output lines of $CHECK_CMD: the two leading lines plus at least one service line
    MIN_CHECK_CMD_LINES = 3

    # Expect status for different system service category.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible',
        'Program': 'Status ok'
    }

    def __init__(self):
        HealthChecker.__init__(self)

    def reset(self):
        """
        Drop all results collected by the previous check.
        :return:
        """
        self._info = {}

    def get_category(self):
        """
        Get category of the checker.
        :return: The string 'Services'
        """
        return 'Services'

    def check(self, config):
        """
        Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
        process and file system.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()
        # run_command returns None when the command could not be executed; treat that like an inactive service
        # instead of failing on None.strip()
        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
        if output is None or output.strip() != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        output = utils.run_command(ServiceChecker.CHECK_CMD)
        lines = output.splitlines() if output else []
        if len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        # The second line is the column header; the offsets of its 'Status' and 'Type' titles give the column
        # boundaries used to slice every following line
        status_begin = lines[1].find('Status')
        type_begin = lines[1].find('Type')
        if status_begin < 0 or type_begin < 0:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        for line in lines[2:]:
            name = line[0:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
                continue
            status = line[status_begin:type_begin].strip()
            service_type = line[type_begin:].strip()
            # Service types without an expected status are not checked
            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
                continue
            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
            if expect_status != status:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
            else:
                self.set_object_ok(service_type, name)

View File

@ -0,0 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "amber",
"normal": "green",
"booting": "orange_blink"
}
}

View File

@ -0,0 +1,88 @@
from .health_checker import HealthChecker
from . import utils
class UserDefinedChecker(HealthChecker):
    """
    Checker backed by a user supplied script or program, for checks that are specific to a particular system.
    To enable one:
        1. Add an element to "user_defined_checkers" in the configuration file. The element must be a command
           string that can be executed by shell. For example: "python my_checker.py".
        2. Make the command print its result in the following layout:
            ${UserDefineCategory}
            ${Object1}:${ObjectStatusMessage1}
            ${Object2}:${ObjectStatusMessage2}

    Example of a valid command output:
        MyCategory
        Device1:OK
        Device2:OK
        Device3:Out of power
    """

    def __init__(self, cmd):
        """
        Constructor.
        :param cmd: Command string of the user defined checker.
        """
        HealthChecker.__init__(self)
        self._cmd = cmd
        self._category = None

    def reset(self):
        """
        Restore the default category and drop the previous result.
        :return:
        """
        self._category = 'UserDefine'
        self._info = {}

    def get_category(self):
        """
        Category of the checker: the first output line of the last run, or 'UserDefine' if no category was parsed.
        :return: String category
        """
        return self._category

    def check(self, config):
        """
        Run the user defined command and turn its output into per-object results.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()

        raw_output = utils.run_command(self._cmd)
        stripped_output = raw_output.strip() if raw_output else raw_output
        if not stripped_output:
            self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd))
            return

        # Keep only the non-blank lines, without surrounding whitespace
        entries = [text for text in (raw.strip() for raw in stripped_output.splitlines()) if text]
        if not entries:
            self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd))
            return

        # First line names the category, the remaining lines describe one object each
        self._category = entries[0]
        for entry in entries[1:]:
            obj_name, separator, msg = entry.partition(':')
            if not separator:
                # Lines without ':' are ignored
                continue
            obj_name = obj_name.strip()
            msg = msg.strip()
            if msg == 'OK':
                self.set_object_ok('UserDefine', obj_name)
            else:
                self.set_object_not_ok('UserDefine', obj_name, msg)

    def __str__(self):
        return 'UserDefinedChecker - {}'.format(self._cmd)

View File

@ -0,0 +1,25 @@
import subprocess
def run_command(command):
    """
    Utility function to run a shell command and return the output.
    :param command: Shell command string.
    :return: Standard output of the shell command as a text string, or None if the command could not be run or
             its output could not be decoded.
    """
    try:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        output = process.communicate()[0]
        # On Python 3 the pipe yields bytes, which must be decoded (bytes has no encode()). On Python 2 the
        # output is already a str and is returned unchanged.
        if not isinstance(output, str):
            output = output.decode('utf-8')
        return output
    except Exception:
        return None
def get_uptime():
    """
    Read the system up time from the first field of /proc/uptime.
    :return: System up time in seconds.
    """
    with open('/proc/uptime', 'r') as uptime_file:
        first_field = uptime_file.readline().split()[0]
    return float(first_field)

View File

@ -0,0 +1,2 @@
[pytest]
addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml

View File

@ -0,0 +1,108 @@
#!/usr/bin/env python2
"""
healthd
System health monitor daemon for SONiC
"""
import signal
import threading
from sonic_py_common.daemon_base import DaemonBase
from swsssdk import SonicV2Connector
from health_checker.manager import HealthCheckerManager
SYSLOG_IDENTIFIER = 'healthd'
class HealthDaemon(DaemonBase):
    """
    A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
    according to the check result and store the check result to redis.
    """
    SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'

    def __init__(self):
        """
        Constructor of HealthDaemon. Connect to STATE_DB on the local host and create the stop event.
        """
        DaemonBase.__init__(self, SYSLOG_IDENTIFIER)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)
        # Set by the signal handler to make run() leave its loop
        self.stop_event = threading.Event()

    def deinit(self):
        """
        Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
        :return:
        """
        self._clear_system_health_table()

    def _clear_system_health_table(self):
        """
        Delete the entries matching $SYSTEM_HEALTH_TABLE_NAME from STATE_DB.
        :return:
        """
        self._db.delete_all_by_pattern(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME)

    # Signal handler
    def signal_handler(self, sig, frame):
        """
        Signal handler. SIGHUP is ignored; SIGINT and SIGTERM request the main loop to stop; any other signal is
        only logged.
        :param sig: Signal number
        :param frame: not used
        :return:
        """
        if sig == signal.SIGHUP:
            self.log_notice("Caught SIGHUP - ignoring...")
        elif sig == signal.SIGINT:
            self.log_notice("Caught SIGINT - exiting...")
            self.stop_event.set()
        elif sig == signal.SIGTERM:
            self.log_notice("Caught SIGTERM - exiting...")
            self.stop_event.set()
        else:
            # sig is an integer, so it must be formatted rather than concatenated to the message
            self.log_warning("Caught unhandled signal '{}'".format(sig))

    def run(self):
        """
        Check system health in an infinite loop. Returns immediately if the system health configuration file does
        not exist; otherwise loops until the stop event is set, then clears the health table.
        :return:
        """
        self.log_notice("Starting up...")

        import sonic_platform.platform
        chassis = sonic_platform.platform.Platform().get_chassis()
        manager = HealthCheckerManager()
        if not manager.config.config_file_exists():
            self.log_warning("System health configuration file not found, exit...")
            return

        while True:
            state, stat = manager.check(chassis)
            # Results are only published once the manager has left the booting state
            if state == HealthCheckerManager.STATE_RUNNING:
                self._process_stat(chassis, manager.config, stat)

            # Sleep for one polling interval; wait() returns True as soon as a stop was requested
            if self.stop_event.wait(manager.config.interval):
                break

        self.deinit()

    def _process_stat(self, chassis, config, stat):
        """
        Publish a check result to STATE_DB: clear the previous content, store the message of every object that is
        not OK, then store the overall summary.
        :param chassis: not used
        :param config: not used
        :param stat: Dictionary keyed by category; each value maps object names to their attributes.
        :return:
        """
        from health_checker.health_checker import HealthChecker
        self._clear_system_health_table()
        for category, info in stat.items():
            for obj_name, obj_data in info.items():
                if obj_data[HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK:
                    self._db.set(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME, obj_name,
                                 obj_data[HealthChecker.INFO_FIELD_OBJECT_MSG])

        self._db.set(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME, 'summary', HealthChecker.summary)
#
# Main =========================================================================
#
def main():
    """Entry point: create the health daemon and run its check loop."""
    HealthDaemon().run()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,2 @@
[aliases]
test=pytest

View File

@ -0,0 +1,49 @@
from setuptools import setup

# Packages required at run time.
dependencies = [
    'natsort',
    'sonic_py_common',
    'swsssdk>=2.0.1',
]

# Packages needed only to build / run the unit tests.
build_requirements = ['pytest-runner']
test_requirements = ['pytest', 'mock>=2.0.0']

# Trove classifiers describing the package.
trove_classifiers = [
    'Development Status :: 4 - Beta',
    'Environment :: No Input/Output (Daemon)',
    'Intended Audience :: Developers',
    'Intended Audience :: Information Technology',
    'Intended Audience :: System Administrators',
    'License :: OSI Approved :: Apache Software License',
    'Natural Language :: English',
    'Operating System :: POSIX :: Linux',
    'Programming Language :: Python :: 2.7',
    'Topic :: System :: Hardware',
]

package_metadata = dict(
    name='system-health',
    version='1.0',
    description='SONiC system health package',
    license='Apache 2.0',
    author='SONiC Team',
    author_email='linuxnetdev@microsoft.com',
    url='https://github.com/Azure/sonic-buildimage',
    maintainer='Junchao Chen',
    maintainer_email='junchaow@mellanox.com',
    install_requires=dependencies,
    packages=['health_checker', 'tests'],
    scripts=['scripts/healthd'],
    setup_requires=build_requirements,
    tests_require=test_requirements,
    classifiers=trove_classifiers,
    keywords='SONiC sonic HEALTH health',
    # NOTE(review): no get_test_suite function is visible in this file -- confirm this entry is still needed.
    test_suite='setup.get_test_suite',
)

setup(**package_metadata)

View File

View File

@ -0,0 +1,25 @@
class MockConnector(object):
    """
    In-memory stand-in for a database connector, used by the unit tests.
    All instances share the class-level ``data`` dict, which maps a full key to a dict of fields.
    """

    STATE_DB = None
    # Shared by every instance; tests fill it directly via MockConnector.data.
    data = {}

    def __init__(self, host):
        """Accept the host argument and ignore it."""
        pass

    def connect(self, db_id):
        """No-op; there is nothing to connect to."""
        pass

    def get(self, db_id, key, field):
        """Return one field of the entry stored under key (KeyError if either is missing)."""
        entry = MockConnector.data[key]
        return entry[field]

    def keys(self, db_id, pattern):
        """
        Return every stored key that contains the part of pattern preceding its first '*'.
        This is a substring match, not a full glob match.
        """
        fragment = pattern.split('*')[0]
        return [name for name in MockConnector.data.keys() if fragment in name]

    def get_all(self, db_id, key):
        """Return the whole field dict stored under key (KeyError if missing)."""
        return MockConnector.data[key]

View File

@ -0,0 +1,219 @@
"""
Unit test cases for system health checker. The current test case contains:
1. test_user_defined_checker mocks the output of a user defined checker and verify class UserDefinedChecker
2. test_service_checker mocks the output of monit service and verify class ServiceChecker
3. test_hardware_checker mocks the hardware status data in db and verify class HardwareChecker
And there are class that are not covered by unit test. These class will be covered by sonic-mgmt regression test.
1. HealthDaemon
2. HealthCheckerManager
3. Config
"""
import os
import sys
import swsssdk
from mock import Mock, MagicMock, patch
from sonic_py_common import device_info
from .mock_connector import MockConnector
# Replace the real connector class with the in-memory mock.
# NOTE(review): presumably this must run before the health_checker imports below so that
# they pick up the mock -- confirm before reordering.
swsssdk.SonicV2Connector = MockConnector
# Put the parent directory of this tests directory at the front of sys.path so that
# the health_checker package next to it can be imported.
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)
from health_checker import utils
from health_checker.config import Config
from health_checker.hardware_checker import HardwareChecker
from health_checker.health_checker import HealthChecker
from health_checker.manager import HealthCheckerManager
from health_checker.service_checker import ServiceChecker
from health_checker.user_defined_checker import UserDefinedChecker
# Make platform lookup return a fixed fake platform name for every test in this module.
device_info.get_platform = MagicMock(return_value='unittest')
def test_user_defined_checker():
    """
    Feed UserDefinedChecker mocked command output and verify the recorded status:
    empty or blank-line-only output yields a NOT_OK entry keyed by str(checker), while
    well-formed output yields one entry per reported device.
    """
    status_field = HealthChecker.INFO_FIELD_OBJECT_STATUS

    # Empty command output.
    utils.run_command = MagicMock(return_value='')
    udc = UserDefinedChecker('')
    udc.check(None)
    assert udc._info[str(udc)][status_field] == HealthChecker.STATUS_NOT_OK

    # reset() must drop everything collected so far.
    udc.reset()
    assert len(udc._info) == 0

    # Output that consists of newlines only.
    utils.run_command = MagicMock(return_value='\n\n\n')
    udc.check(None)
    assert udc._info[str(udc)][status_field] == HealthChecker.STATUS_NOT_OK

    # Well-formed output: a category line followed by "<name>:<message>" lines.
    utils.run_command = MagicMock(return_value='MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n')
    udc.check(None)
    expectations = (
        ('Device1', HealthChecker.STATUS_OK),
        ('Device2', HealthChecker.STATUS_NOT_OK),
    )
    for device, _ in expectations:
        assert device in udc._info
    for device, expected_status in expectations:
        assert udc._info[device][status_field] == expected_status
def test_service_checker():
    """
    Feed ServiceChecker a mocked monit summary and verify the status recorded for each
    listed system, process and filesystem entry.
    """
    monit_summary = ''.join([
        'Monit 5.20.0 uptime: 3h 54m\n',
        'Service Name Status Type\n',
        'sonic Running System\n',
        'sonic1 Not running System\n',
        'telemetry Does not exist Process\n',
        'orchagent Running Process\n',
        'root-overlay Accessible Filesystem\n',
        'var-log Is not accessible Filesystem\n',
    ])

    def fake_run_command(cmd):
        # The monit service state query answers 'active'; any other command gets the summary.
        if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD:
            return 'active'
        return monit_summary

    utils.run_command = fake_run_command

    service_checker = ServiceChecker()
    service_checker.check(Config())

    expected_status = (
        ('sonic', HealthChecker.STATUS_OK),
        ('sonic1', HealthChecker.STATUS_NOT_OK),
        ('orchagent', HealthChecker.STATUS_OK),
        ('telemetry', HealthChecker.STATUS_NOT_OK),
        ('root-overlay', HealthChecker.STATUS_OK),
        ('var-log', HealthChecker.STATUS_NOT_OK),
    )
    for service, status in expected_status:
        assert service in service_checker._info
        assert service_checker._info[service][HealthChecker.INFO_FIELD_OBJECT_STATUS] == status
def test_hardware_checker():
    """
    Populate the mock DB with ASIC temperature, fan and PSU entries and verify the status
    HardwareChecker records for each of them.
    """
    def fan_entry(presence, status, speed):
        # Every fan fixture shares the same target speed and tolerance.
        return {
            'presence': presence,
            'status': status,
            'speed': speed,
            'speed_target': '60',
            'speed_tolerance': '20'
        }

    def psu_entry(presence, status, temp, voltage_min):
        # Every PSU fixture shares the same temperature threshold, voltage and max voltage.
        return {
            'presence': presence,
            'status': status,
            'temp': temp,
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': voltage_min,
            'voltage_max_threshold': '15',
        }

    db = MockConnector.data

    # ASIC temperature below its high threshold.
    db['TEMPERATURE_INFO|ASIC'] = {
        'temperature': '20',
        'high_threshold': '21'
    }

    # fan1 is healthy; fan2 is absent; fan3 reports bad status; fan4 runs at 20 against a target of 60.
    db['FAN_INFO|fan1'] = fan_entry('True', 'True', '60')
    db['FAN_INFO|fan2'] = fan_entry('False', 'True', '60')
    db['FAN_INFO|fan3'] = fan_entry('True', 'False', '60')
    db['FAN_INFO|fan4'] = fan_entry('True', 'True', '20')

    # PSU 1 is healthy; PSU 2 is absent; PSU 3 reports bad status;
    # PSU 4 is above its temperature threshold; PSU 5 is below its minimum voltage.
    db['PSU_INFO|PSU 1'] = psu_entry('True', 'True', '55', '8')
    db['PSU_INFO|PSU 2'] = psu_entry('False', 'True', '55', '8')
    db['PSU_INFO|PSU 3'] = psu_entry('True', 'False', '55', '8')
    db['PSU_INFO|PSU 4'] = psu_entry('True', 'True', '101', '8')
    db['PSU_INFO|PSU 5'] = psu_entry('True', 'True', '55', '12')

    hw_checker = HardwareChecker()
    hw_checker.check(Config())

    healthy = HealthChecker.STATUS_OK
    faulty = HealthChecker.STATUS_NOT_OK
    expected_status = (
        ('ASIC', healthy),
        ('fan1', healthy),
        ('fan2', faulty),
        ('fan3', faulty),
        ('fan4', faulty),
        ('PSU 1', healthy),
        ('PSU 2', faulty),
        ('PSU 3', faulty),
        ('PSU 4', faulty),
        ('PSU 5', faulty),
    )
    for name, status in expected_status:
        assert name in hw_checker._info
        assert hw_checker._info[name][HealthChecker.INFO_FIELD_OBJECT_STATUS] == status