[system-health] Convert to Python 3 (#5886)
- Convert system-health scripts to Python 3 - Build and install system-health as a Python 3 wheel - Also convert newlines from DOS to UNIX
This commit is contained in:
parent
62662acbd5
commit
566ea4f601
@ -201,12 +201,6 @@ sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install $PLATF
|
||||
sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_PDDF_COMMON_PY2_WHEEL_NAME
|
||||
{% endif %}
|
||||
|
||||
# Install system-health Python 2 package
|
||||
SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}})
|
||||
sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
|
||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install $SYSTEM_HEALTH_PY2_WHEEL_NAME
|
||||
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
|
||||
|
||||
# Install sonic-platform-common Python 3 package
|
||||
PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}})
|
||||
sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME
|
||||
@ -219,6 +213,12 @@ sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install thrift
|
||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install thrift==0.13.0
|
||||
{% endif %}
|
||||
|
||||
# Install system-health Python 3 package
|
||||
SYSTEM_HEALTH_PY3_WHEEL_NAME=$(basename {{system_health_py3_wheel_path}})
|
||||
sudo cp {{system_health_py3_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY3_WHEEL_NAME
|
||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install $SYSTEM_HEALTH_PY3_WHEEL_NAME
|
||||
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY3_WHEEL_NAME
|
||||
|
||||
# Install prerequisites needed for installing the Python m2crypto package, used by sonic-utilities
|
||||
# These packages can be uninstalled after installation
|
||||
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install build-essential libssl-dev swig
|
||||
|
@ -1,9 +1,9 @@
|
||||
# system health python2 wheel
|
||||
# system health Python wheel
|
||||
|
||||
SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl
|
||||
SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
|
||||
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
|
||||
$(SYSTEM_HEALTH)_PYTHON_VERSION = 2
|
||||
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE)
|
||||
$(SYSTEM_HEALTH)_PYTHON_VERSION = 3
|
||||
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SWSSSDK_PY3) $(SONIC_CONFIG_ENGINE_PY3)
|
||||
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
|
||||
|
||||
export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
|
||||
export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
|
||||
|
@ -1,2 +1,2 @@
|
||||
from . import hardware_checker
|
||||
from . import service_checker
|
||||
from . import hardware_checker
|
||||
from . import service_checker
|
||||
|
@ -1,144 +1,144 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from sonic_py_common import device_info
|
||||
|
||||
|
||||
class Config(object):
    """
    Manage configuration of system health.

    Settings are read from an optional JSON file located in the platform's device directory. When the
    file is missing or cannot be parsed, every entry falls back to the class-level defaults below.
    """

    # Default system health check interval
    DEFAULT_INTERVAL = 60

    # Default boot up timeout. After a reboot, system health waits a few seconds before starting to work.
    DEFAULT_BOOTUP_TIMEOUT = 300

    # Default LED configuration. Different platforms have different LED capabilities. This configuration
    # allows vendors to override the default behavior.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }

    # System health configuration file name
    CONFIG_FILE = 'system_health_monitoring_config.json'

    # Monit service configuration file path
    MONIT_CONFIG_FILE = '/etc/monit/monitrc'

    # Monit service start delay configuration entry
    MONIT_START_DELAY_CONFIG = 'with start delay'

    def __init__(self):
        """
        Constructor. Initialize all configuration entries to default values in case there is no
        configuration file.
        """
        self.platform_name = device_info.get_platform()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def config_file_exists(self):
        """
        Tell whether the platform configuration file is present on disk.
        :return: True if the file exists, False otherwise
        """
        return os.path.exists(self._config_file)

    def load_config(self):
        """
        Load the configuration file from disk.
        1. If there is no configuration file, current config entries are reset to default values
        2. The file is only re-read when its modification time changed since the last load
        3. If there is any format issue in the configuration file, current config entries are reset to
           default values
        :return:
        """
        if not self.config_file_exists():
            if self._last_mtime is not None:
                self._reset()
            return

        # Compare the modification time only. A full stat result also carries the access time, which
        # can change simply because the file was read, and that would defeat the reload cache.
        mtime = os.stat(self._config_file).st_mtime
        if mtime == self._last_mtime:
            return

        try:
            self._last_mtime = mtime
            with open(self._config_file, 'r') as f:
                self.config_data = json.load(f)

            self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
            self.ignore_services = self._get_list_data('services_to_ignore')
            self.ignore_devices = self._get_list_data('devices_to_ignore')
            self.user_defined_checkers = self._get_list_data('user_defined_checkers')
        except Exception:
            # Deliberately broad: a malformed file (bad JSON, unexpected top-level type, unhashable
            # list items, ...) must never break the caller; fall back to the defaults instead.
            self._reset()

    def _reset(self):
        """
        Reset current configuration entries to default values
        :return:
        """
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def get_led_color(self, status):
        """
        Get desired LED color according to the input status
        :param status: System health status
        :return: String LED color. A 'led_color' entry of the loaded configuration takes precedence over
                 DEFAULT_LED_CONFIG.
        :raises KeyError: if status is neither configured nor present in DEFAULT_LED_CONFIG
        """
        if self.config_data and 'led_color' in self.config_data:
            if status in self.config_data['led_color']:
                return self.config_data['led_color'][status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """
        Get boot up timeout from monit configuration file.
        1. If monit configuration file does not exist, return default value
        2. If there is any error while reading or parsing monit config, return default value
        3. If monit config has no start delay entry, return default value
        :return: Integer timeout value
        """
        if not os.path.exists(Config.MONIT_CONFIG_FILE):
            return self.DEFAULT_BOOTUP_TIMEOUT

        try:
            with open(Config.MONIT_CONFIG_FILE) as f:
                for line in f:
                    # Keep only the text in front of the first '#'. Splitting (instead of slicing with
                    # the result of find()) leaves lines without any comment untouched, so the last
                    # digit of the delay is no longer cut off.
                    line = line.split('#', 1)[0].strip()
                    pos = line.find(Config.MONIT_START_DELAY_CONFIG)
                    if pos != -1:
                        return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
        except (OSError, ValueError):
            # Unreadable file or a delay value that is not an integer.
            return self.DEFAULT_BOOTUP_TIMEOUT

        # No start delay entry found: honor the documented integer return type.
        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_list_data(self, key):
        """
        Get list type configuration data by key and remove duplicate elements.
        :param key: Key of the configuration entry
        :return: A set of configuration data if key exists and holds a list, otherwise None
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None
|
||||
import json
|
||||
import os
|
||||
|
||||
from sonic_py_common import device_info
|
||||
|
||||
|
||||
class Config(object):
    """
    Manage configuration of system health.

    Settings are read from an optional JSON file located in the platform's device directory. When the
    file is missing or cannot be parsed, every entry falls back to the class-level defaults below.
    """

    # Default system health check interval
    DEFAULT_INTERVAL = 60

    # Default boot up timeout. After a reboot, system health waits a few seconds before starting to work.
    DEFAULT_BOOTUP_TIMEOUT = 300

    # Default LED configuration. Different platforms have different LED capabilities. This configuration
    # allows vendors to override the default behavior.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }

    # System health configuration file name
    CONFIG_FILE = 'system_health_monitoring_config.json'

    # Monit service configuration file path
    MONIT_CONFIG_FILE = '/etc/monit/monitrc'

    # Monit service start delay configuration entry
    MONIT_START_DELAY_CONFIG = 'with start delay'

    def __init__(self):
        """
        Constructor. Initialize all configuration entries to default values in case there is no
        configuration file.
        """
        self.platform_name = device_info.get_platform()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def config_file_exists(self):
        """
        Tell whether the platform configuration file is present on disk.
        :return: True if the file exists, False otherwise
        """
        return os.path.exists(self._config_file)

    def load_config(self):
        """
        Load the configuration file from disk.
        1. If there is no configuration file, current config entries are reset to default values
        2. The file is only re-read when its modification time changed since the last load
        3. If there is any format issue in the configuration file, current config entries are reset to
           default values
        :return:
        """
        if not self.config_file_exists():
            if self._last_mtime is not None:
                self._reset()
            return

        # Compare the modification time only. A full stat result also carries the access time, which
        # can change simply because the file was read, and that would defeat the reload cache.
        mtime = os.stat(self._config_file).st_mtime
        if mtime == self._last_mtime:
            return

        try:
            self._last_mtime = mtime
            with open(self._config_file, 'r') as f:
                self.config_data = json.load(f)

            self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
            self.ignore_services = self._get_list_data('services_to_ignore')
            self.ignore_devices = self._get_list_data('devices_to_ignore')
            self.user_defined_checkers = self._get_list_data('user_defined_checkers')
        except Exception:
            # Deliberately broad: a malformed file (bad JSON, unexpected top-level type, unhashable
            # list items, ...) must never break the caller; fall back to the defaults instead.
            self._reset()

    def _reset(self):
        """
        Reset current configuration entries to default values
        :return:
        """
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def get_led_color(self, status):
        """
        Get desired LED color according to the input status
        :param status: System health status
        :return: String LED color. A 'led_color' entry of the loaded configuration takes precedence over
                 DEFAULT_LED_CONFIG.
        :raises KeyError: if status is neither configured nor present in DEFAULT_LED_CONFIG
        """
        if self.config_data and 'led_color' in self.config_data:
            if status in self.config_data['led_color']:
                return self.config_data['led_color'][status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """
        Get boot up timeout from monit configuration file.
        1. If monit configuration file does not exist, return default value
        2. If there is any error while reading or parsing monit config, return default value
        3. If monit config has no start delay entry, return default value
        :return: Integer timeout value
        """
        if not os.path.exists(Config.MONIT_CONFIG_FILE):
            return self.DEFAULT_BOOTUP_TIMEOUT

        try:
            with open(Config.MONIT_CONFIG_FILE) as f:
                for line in f:
                    # Keep only the text in front of the first '#'. Splitting (instead of slicing with
                    # the result of find()) leaves lines without any comment untouched, so the last
                    # digit of the delay is no longer cut off.
                    line = line.split('#', 1)[0].strip()
                    pos = line.find(Config.MONIT_START_DELAY_CONFIG)
                    if pos != -1:
                        return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
        except (OSError, ValueError):
            # Unreadable file or a delay value that is not an integer.
            return self.DEFAULT_BOOTUP_TIMEOUT

        # No start delay entry found: honor the documented integer return type.
        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_list_data(self, key):
        """
        Get list type configuration data by key and remove duplicate elements.
        :param key: Key of the configuration entry
        :return: A set of configuration data if key exists and holds a list, otherwise None
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None
|
||||
|
@ -1,248 +1,248 @@
|
||||
from natsort import natsorted
|
||||
from swsssdk import SonicV2Connector
|
||||
|
||||
from .health_checker import HealthChecker
|
||||
|
||||
|
||||
class HardwareChecker(HealthChecker):
    """
    Health checker for the hardware category. It inspects the ASIC temperature, the fans and the PSUs
    using data read from STATE_DB.
    """
    ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
    FAN_TABLE_NAME = 'FAN_INFO'
    PSU_TABLE_NAME = 'PSU_INFO'

    def __init__(self):
        """
        Initialize the base checker and connect to STATE_DB on the local host.
        """
        HealthChecker.__init__(self)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)

    def get_category(self):
        """
        :return: The category name of this checker, 'Hardware'
        """
        return 'Hardware'

    def check(self, config):
        """
        Clear the previous results, then run the ASIC, fan and PSU checks in that order.
        :param config: Health checker configuration
        :return:
        """
        self.reset()
        for run_check in (self._check_asic_status, self._check_fan_status, self._check_psu_status):
            run_check(config)

    def _check_asic_status(self, config):
        """
        Verify that the ASIC temperature does not exceed its high threshold.
        :param config: Health checker configuration
        :return:
        """
        skipped = config.ignore_devices
        if skipped and 'asic' in skipped:
            return

        reading = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature')
        limit = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold')
        if not reading:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
            return
        if not limit:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
            return

        try:
            # Rebind in place so the error message reports whatever was converted so far.
            reading = float(reading)
            limit = float(limit)
        except ValueError:
            message = 'Invalid ASIC temperature data, temperature={}, threshold={}'.format(reading, limit)
            self.set_object_not_ok('ASIC', 'ASIC', message)
        else:
            if reading > limit:
                message = 'ASIC temperature is too hot, temperature={}, threshold={}'.format(reading, limit)
                self.set_object_not_ok('ASIC', 'ASIC', message)
            else:
                self.set_object_ok('ASIC', 'ASIC')

    def _check_fan_status(self, config):
        """
        Run the fan checks for every FAN_INFO entry:
            1. The fan is present
            2. The fan reports a good state
            3. The fan speed lies within the tolerance around its target speed
        :param config: Health checker configuration
        :return:
        """
        skipped = config.ignore_devices
        if skipped and 'fan' in skipped:
            return

        fan_keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
        if not fan_keys:
            self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
            return

        for fan_key in natsorted(fan_keys):
            parts = fan_key.split('|')
            if len(parts) != 2:  # malformed key in DB: record it and move on
                self.set_object_not_ok('Fan', fan_key, 'Invalid key for FAN_INFO: {}'.format(fan_key))
                continue

            name = parts[1]
            if skipped and name in skipped:
                continue

            fields = self._db.get_all(self._db.STATE_DB, fan_key)
            if fields.get('presence', 'false').lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
                continue

            if fields.get('status', 'false').lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
                continue

            if not self._ignore_check(skipped, 'fan', name, 'speed'):
                speed = fields.get('speed', None)
                target = fields.get('speed_target', None)
                tolerance = fields.get('speed_tolerance', None)
                if not speed:
                    self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
                    continue
                if not target:
                    self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
                    continue
                if not tolerance:
                    self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
                    continue

                try:
                    # Rebind in place so the error message reports whatever was converted so far.
                    speed = float(speed)
                    target = float(target)
                    tolerance = float(tolerance)
                except ValueError:
                    message = 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
                        name, speed, target, tolerance)
                    self.set_object_not_ok('Fan', name, message)
                    continue

                # Tolerance is a percentage of the target speed.
                lowest = target * (1 - tolerance / 100)
                highest = target * (1 + tolerance / 100)
                if speed < lowest or speed > highest:
                    message = '{} speed is out of range, speed={}, range=[{},{}]'.format(
                        name, speed, lowest, highest)
                    self.set_object_not_ok('Fan', name, message)
                    continue

            self.set_object_ok('Fan', name)

    def _check_psu_status(self, config):
        """
        Run the PSU checks for every PSU_INFO entry:
            1. The PSU is present
            2. The PSU is powered on
            3. The PSU temperature does not exceed its threshold
            4. The PSU voltage lies between its minimum and maximum thresholds
        :param config: Health checker configuration
        :return:
        """
        skipped = config.ignore_devices
        if skipped and 'psu' in skipped:
            return

        psu_keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
        if not psu_keys:
            self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
            return

        for psu_key in natsorted(psu_keys):
            parts = psu_key.split('|')
            if len(parts) != 2:  # malformed key in DB: record it and move on
                self.set_object_not_ok('PSU', psu_key, 'Invalid key for PSU_INFO: {}'.format(psu_key))
                continue

            name = parts[1]
            if skipped and name in skipped:
                continue

            fields = self._db.get_all(self._db.STATE_DB, psu_key)
            if fields.get('presence', 'false').lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
                continue

            if fields.get('status', 'false').lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
                continue

            if not self._ignore_check(skipped, 'psu', name, 'temperature'):
                reading = fields.get('temp', None)
                limit = fields.get('temp_threshold', None)
                if reading is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
                    continue
                if limit is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get temperature threshold data for {}'.format(name))
                    continue

                try:
                    reading = float(reading)
                    limit = float(limit)
                except ValueError:
                    message = 'Invalid temperature data for {}, temperature={}, threshold={}'.format(
                        name, reading, limit)
                    self.set_object_not_ok('PSU', name, message)
                    continue

                if reading > limit:
                    message = '{} temperature is too hot, temperature={}, threshold={}'.format(
                        name, reading, limit)
                    self.set_object_not_ok('PSU', name, message)
                    continue

            if not self._ignore_check(skipped, 'psu', name, 'voltage'):
                volts = fields.get('voltage', None)
                lowest = fields.get('voltage_min_threshold', None)
                highest = fields.get('voltage_max_threshold', None)
                if volts is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
                    continue
                if lowest is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage minimum threshold data for {}'.format(name))
                    continue
                if highest is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage maximum threshold data for {}'.format(name))
                    continue

                try:
                    volts = float(volts)
                    lowest = float(lowest)
                    highest = float(highest)
                except ValueError:
                    message = 'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(
                        name, volts, lowest, highest)
                    self.set_object_not_ok('PSU', name, message)
                    continue

                if volts < lowest or volts > highest:
                    message = '{} voltage is out of range, voltage={}, range=[{},{}]'.format(
                        name, volts, lowest, highest)
                    self.set_object_not_ok('PSU', name, message)
                    continue

            self.set_object_ok('PSU', name)

    def reset(self):
        """
        Drop all results collected by the previous check.
        :return:
        """
        self._info = dict()

    @classmethod
    def _ignore_check(cls, ignore_set, category, object_name, check_point):
        """
        Tell whether one check point is ignored, either for the whole category ('<category>.<check_point>')
        or for a single object ('<object_name>.<check_point>').
        :param ignore_set: Collection of ignore entries, may be empty or None
        :param category: Category name, e.g. 'fan' or 'psu'
        :param object_name: Name of the object being checked
        :param check_point: Name of the check point, e.g. 'speed'
        :return: True if the check point must be skipped, False otherwise
        """
        if not ignore_set:
            return False

        return any('{}.{}'.format(scope, check_point) in ignore_set for scope in (category, object_name))
|
||||
from natsort import natsorted
|
||||
from swsssdk import SonicV2Connector
|
||||
|
||||
from .health_checker import HealthChecker
|
||||
|
||||
|
||||
class HardwareChecker(HealthChecker):
    """
    Health checker for the hardware category. It inspects the ASIC temperature, the fans and the PSUs
    using data read from STATE_DB.
    """
    ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
    FAN_TABLE_NAME = 'FAN_INFO'
    PSU_TABLE_NAME = 'PSU_INFO'

    def __init__(self):
        """
        Initialize the base checker and connect to STATE_DB on the local host.
        """
        HealthChecker.__init__(self)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)

    def get_category(self):
        """
        :return: The category name of this checker, 'Hardware'
        """
        return 'Hardware'

    def check(self, config):
        """
        Clear the previous results, then run the ASIC, fan and PSU checks in that order.
        :param config: Health checker configuration
        :return:
        """
        self.reset()
        for run_check in (self._check_asic_status, self._check_fan_status, self._check_psu_status):
            run_check(config)

    def _check_asic_status(self, config):
        """
        Verify that the ASIC temperature does not exceed its high threshold.
        :param config: Health checker configuration
        :return:
        """
        skipped = config.ignore_devices
        if skipped and 'asic' in skipped:
            return

        reading = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature')
        limit = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold')
        if not reading:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
            return
        if not limit:
            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
            return

        try:
            # Rebind in place so the error message reports whatever was converted so far.
            reading = float(reading)
            limit = float(limit)
        except ValueError:
            message = 'Invalid ASIC temperature data, temperature={}, threshold={}'.format(reading, limit)
            self.set_object_not_ok('ASIC', 'ASIC', message)
        else:
            if reading > limit:
                message = 'ASIC temperature is too hot, temperature={}, threshold={}'.format(reading, limit)
                self.set_object_not_ok('ASIC', 'ASIC', message)
            else:
                self.set_object_ok('ASIC', 'ASIC')

    def _check_fan_status(self, config):
        """
        Run the fan checks for every FAN_INFO entry:
            1. The fan is present
            2. The fan reports a good state
            3. The fan speed lies within the tolerance around its target speed
        :param config: Health checker configuration
        :return:
        """
        skipped = config.ignore_devices
        if skipped and 'fan' in skipped:
            return

        fan_keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
        if not fan_keys:
            self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
            return

        for fan_key in natsorted(fan_keys):
            parts = fan_key.split('|')
            if len(parts) != 2:  # malformed key in DB: record it and move on
                self.set_object_not_ok('Fan', fan_key, 'Invalid key for FAN_INFO: {}'.format(fan_key))
                continue

            name = parts[1]
            if skipped and name in skipped:
                continue

            fields = self._db.get_all(self._db.STATE_DB, fan_key)
            if fields.get('presence', 'false').lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
                continue

            if fields.get('status', 'false').lower() != 'true':
                self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
                continue

            if not self._ignore_check(skipped, 'fan', name, 'speed'):
                speed = fields.get('speed', None)
                target = fields.get('speed_target', None)
                tolerance = fields.get('speed_tolerance', None)
                if not speed:
                    self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
                    continue
                if not target:
                    self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
                    continue
                if not tolerance:
                    self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
                    continue

                try:
                    # Rebind in place so the error message reports whatever was converted so far.
                    speed = float(speed)
                    target = float(target)
                    tolerance = float(tolerance)
                except ValueError:
                    message = 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
                        name, speed, target, tolerance)
                    self.set_object_not_ok('Fan', name, message)
                    continue

                # Tolerance is a percentage of the target speed.
                lowest = target * (1 - tolerance / 100)
                highest = target * (1 + tolerance / 100)
                if speed < lowest or speed > highest:
                    message = '{} speed is out of range, speed={}, range=[{},{}]'.format(
                        name, speed, lowest, highest)
                    self.set_object_not_ok('Fan', name, message)
                    continue

            self.set_object_ok('Fan', name)

    def _check_psu_status(self, config):
        """
        Run the PSU checks for every PSU_INFO entry:
            1. The PSU is present
            2. The PSU is powered on
            3. The PSU temperature does not exceed its threshold
            4. The PSU voltage lies between its minimum and maximum thresholds
        :param config: Health checker configuration
        :return:
        """
        skipped = config.ignore_devices
        if skipped and 'psu' in skipped:
            return

        psu_keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
        if not psu_keys:
            self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
            return

        for psu_key in natsorted(psu_keys):
            parts = psu_key.split('|')
            if len(parts) != 2:  # malformed key in DB: record it and move on
                self.set_object_not_ok('PSU', psu_key, 'Invalid key for PSU_INFO: {}'.format(psu_key))
                continue

            name = parts[1]
            if skipped and name in skipped:
                continue

            fields = self._db.get_all(self._db.STATE_DB, psu_key)
            if fields.get('presence', 'false').lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
                continue

            if fields.get('status', 'false').lower() != 'true':
                self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
                continue

            if not self._ignore_check(skipped, 'psu', name, 'temperature'):
                reading = fields.get('temp', None)
                limit = fields.get('temp_threshold', None)
                if reading is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
                    continue
                if limit is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get temperature threshold data for {}'.format(name))
                    continue

                try:
                    reading = float(reading)
                    limit = float(limit)
                except ValueError:
                    message = 'Invalid temperature data for {}, temperature={}, threshold={}'.format(
                        name, reading, limit)
                    self.set_object_not_ok('PSU', name, message)
                    continue

                if reading > limit:
                    message = '{} temperature is too hot, temperature={}, threshold={}'.format(
                        name, reading, limit)
                    self.set_object_not_ok('PSU', name, message)
                    continue

            if not self._ignore_check(skipped, 'psu', name, 'voltage'):
                volts = fields.get('voltage', None)
                lowest = fields.get('voltage_min_threshold', None)
                highest = fields.get('voltage_max_threshold', None)
                if volts is None:
                    self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
                    continue
                if lowest is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage minimum threshold data for {}'.format(name))
                    continue
                if highest is None:
                    self.set_object_not_ok('PSU', name,
                                           'Failed to get voltage maximum threshold data for {}'.format(name))
                    continue

                try:
                    volts = float(volts)
                    lowest = float(lowest)
                    highest = float(highest)
                except ValueError:
                    message = 'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(
                        name, volts, lowest, highest)
                    self.set_object_not_ok('PSU', name, message)
                    continue

                if volts < lowest or volts > highest:
                    message = '{} voltage is out of range, voltage={}, range=[{},{}]'.format(
                        name, volts, lowest, highest)
                    self.set_object_not_ok('PSU', name, message)
                    continue

            self.set_object_ok('PSU', name)

    def reset(self):
        """
        Drop all results collected by the previous check.
        :return:
        """
        self._info = dict()

    @classmethod
    def _ignore_check(cls, ignore_set, category, object_name, check_point):
        """
        Tell whether one check point is ignored, either for the whole category ('<category>.<check_point>')
        or for a single object ('<object_name>.<check_point>').
        :param ignore_set: Collection of ignore entries, may be empty or None
        :param category: Category name, e.g. 'fan' or 'psu'
        :param object_name: Name of the object being checked
        :param check_point: Name of the check point, e.g. 'speed'
        :return: True if the check point must be skipped, False otherwise
        """
        if not ignore_set:
            return False

        return any('{}.{}'.format(scope, check_point) in ignore_set for scope in (category, object_name))
|
||||
|
@ -1,86 +1,86 @@
|
||||
class HealthChecker(object):
    """
    Base class for health checkers. A checker performs the system health check for one category and keeps
    the per-object results it collected in self._info.
    """
    # Keys of the per-object result dictionary.
    INFO_FIELD_OBJECT_TYPE = 'type'
    INFO_FIELD_OBJECT_STATUS = 'status'
    INFO_FIELD_OBJECT_MSG = 'message'

    # Possible values of the status field and of the summary.
    STATUS_OK = 'OK'
    STATUS_NOT_OK = 'Not OK'

    # Class-level summary shared by all checkers; set_object_not_ok switches it to STATUS_NOT_OK.
    summary = STATUS_OK

    def __init__(self):
        self._info = {}

    def reset(self):
        """
        Reset the status of the checker. Called every time before the check. The base implementation does nothing.
        :return:
        """
        return None

    def get_category(self):
        """
        Get category of the checker. The base implementation returns None.
        :return: String category
        """
        return None

    def get_info(self):
        """
        Get the collected results. The dictionary maps an object name to a dictionary of its attributes.
        :return: Check result.
        """
        return self._info

    def check(self, config):
        """
        Perform the check. The base implementation does nothing.
        :param config: Health checker configuration.
        :return:
        """
        return None

    def __str__(self):
        return type(self).__name__

    def add_info(self, object_name, key, value):
        """
        Add check result for an object, creating the entry of the object on first use.
        :param object_name: Object name.
        :param key: Object attribute name.
        :param value: Object attribute value.
        :return:
        """
        self._info.setdefault(object_name, {})[key] = value

    def set_object_not_ok(self, object_type, object_name, message):
        """
        Record that an object is not OK and mark the shared summary as not OK.
        :param object_type: Object type.
        :param object_name: Object name.
        :param message: A message to describe what is wrong with the object.
        :return:
        """
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, message),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK),
        )
        for field_name, field_value in fields:
            self.add_info(object_name, field_name, field_value)
        HealthChecker.summary = HealthChecker.STATUS_NOT_OK

    def set_object_ok(self, object_type, object_name):
        """
        Record that an object is in good state; its message is set to an empty string.
        :param object_type: Object type.
        :param object_name: Object name.
        :return:
        """
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, ''),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK),
        )
        for field_name, field_value in fields:
            self.add_info(object_name, field_name, field_value)
|
||||
class HealthChecker(object):
    """
    Base class for health checkers. A checker performs the system health check for one category and keeps
    the per-object results it collected in self._info.
    """
    # Keys of the per-object result dictionary.
    INFO_FIELD_OBJECT_TYPE = 'type'
    INFO_FIELD_OBJECT_STATUS = 'status'
    INFO_FIELD_OBJECT_MSG = 'message'

    # Possible values of the status field and of the summary.
    STATUS_OK = 'OK'
    STATUS_NOT_OK = 'Not OK'

    # Class-level summary shared by all checkers; set_object_not_ok switches it to STATUS_NOT_OK.
    summary = STATUS_OK

    def __init__(self):
        self._info = {}

    def reset(self):
        """
        Reset the status of the checker. Called every time before the check. The base implementation does nothing.
        :return:
        """
        return None

    def get_category(self):
        """
        Get category of the checker. The base implementation returns None.
        :return: String category
        """
        return None

    def get_info(self):
        """
        Get the collected results. The dictionary maps an object name to a dictionary of its attributes.
        :return: Check result.
        """
        return self._info

    def check(self, config):
        """
        Perform the check. The base implementation does nothing.
        :param config: Health checker configuration.
        :return:
        """
        return None

    def __str__(self):
        return type(self).__name__

    def add_info(self, object_name, key, value):
        """
        Add check result for an object, creating the entry of the object on first use.
        :param object_name: Object name.
        :param key: Object attribute name.
        :param value: Object attribute value.
        :return:
        """
        self._info.setdefault(object_name, {})[key] = value

    def set_object_not_ok(self, object_type, object_name, message):
        """
        Record that an object is not OK and mark the shared summary as not OK.
        :param object_type: Object type.
        :param object_name: Object name.
        :param message: A message to describe what is wrong with the object.
        :return:
        """
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, message),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK),
        )
        for field_name, field_value in fields:
            self.add_info(object_name, field_name, field_value)
        HealthChecker.summary = HealthChecker.STATUS_NOT_OK

    def set_object_ok(self, object_type, object_name):
        """
        Record that an object is in good state; its message is set to an empty string.
        :param object_type: Object type.
        :param object_name: Object name.
        :return:
        """
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, ''),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK),
        )
        for field_name, field_value in fields:
            self.add_info(object_name, field_name, field_value)
|
||||
|
@ -1,101 +1,101 @@
|
||||
class HealthCheckerManager(object):
    """
    Manage all system health checkers and system health configuration.
    """
    STATE_BOOTING = 'booting'
    STATE_RUNNING = 'running'
    # Boot-up timeout; read from the configuration the first time _is_system_booting needs it.
    boot_timeout = None

    def __init__(self):
        self._checkers = []
        self._state = self.STATE_BOOTING

        from .config import Config
        self.config = Config()
        self.initialize()

    def initialize(self):
        """
        Initialize the manager. Create service checker and hardware checker by default.
        :return:
        """
        from .service_checker import ServiceChecker
        from .hardware_checker import HardwareChecker
        self._checkers.append(ServiceChecker())
        self._checkers.append(HardwareChecker())

    def check(self, chassis):
        """
        Load new configuration if any and perform the system health check for all existing checkers.
        :param chassis: A chassis object.
        :return: A tuple. The first element is the manager state ('booting' or 'running'); the second element is a
                 dictionary that contains the status for all objects that were checked. It is empty while booting.
        """
        from .health_checker import HealthChecker
        # The summary is shared by all checkers, so start every round from OK.
        HealthChecker.summary = HealthChecker.STATUS_OK
        stats = {}
        self.config.load_config()
        # check state first to avoid user change boot timeout in configuration file
        # after finishing system boot
        if self._state == self.STATE_BOOTING and self._is_system_booting():
            self._set_system_led(chassis, self.config, 'booting')
            return self._state, stats

        for checker in self._checkers:
            self._do_check(checker, stats)

        if self.config.user_defined_checkers:
            from .user_defined_checker import UserDefinedChecker
            for udc in self.config.user_defined_checkers:
                checker = UserDefinedChecker(udc)
                self._do_check(checker, stats)

        led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
        self._set_system_led(chassis, self.config, led_status)

        return self._state, stats

    def _do_check(self, checker, stats):
        """
        Do check for a particular checker and collect the check statistic. A checker that raises is reported
        under the 'Internal' category and marks the overall summary as not OK.
        :param checker: A checker object.
        :param stats: Check statistic. Updated in place.
        :return:
        """
        try:
            checker.check(self.config)
            category = checker.get_category()
            info = checker.get_info()
            if category not in stats:
                stats[category] = info
            else:
                stats[category].update(info)
        except Exception as e:
            from .health_checker import HealthChecker
            # A crashed checker is a failure: without this the summary stays OK and check() would set
            # the LED to 'normal' although a 'Not OK' entry is reported below.
            HealthChecker.summary = HealthChecker.STATUS_NOT_OK
            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
            entry = {str(checker): {
                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
            }}
            if 'Internal' not in stats:
                stats['Internal'] = entry
            else:
                stats['Internal'].update(entry)

    def _is_system_booting(self):
        """
        Compare the system uptime with the boot-up timeout and switch the state to running once the
        uptime is no longer below the timeout.
        :return: True if the uptime is still below the boot-up timeout.
        """
        from .utils import get_uptime
        uptime = get_uptime()
        if not self.boot_timeout:
            # Read the timeout only while none is set, so it is not re-read on every call.
            self.boot_timeout = self.config.get_bootup_timeout()
        booting = uptime < self.boot_timeout
        if not booting:
            self._state = self.STATE_RUNNING
        return booting

    def _set_system_led(self, chassis, config, status):
        """
        Set the chassis status LED to the color configured for the given status. Failures are printed, not raised.
        :param chassis: A chassis object providing set_status_led.
        :param config: Health checker configuration providing get_led_color.
        :param status: One of 'booting', 'normal' or 'fault'.
        :return:
        """
        try:
            chassis.set_status_led(config.get_led_color(status))
        except NotImplementedError:
            print('chassis.set_status_led is not implemented')
        except Exception as e:
            print('Failed to set system led due to - {}'.format(repr(e)))
|
||||
class HealthCheckerManager(object):
    """
    Manage all system health checkers and system health configuration.
    """
    STATE_BOOTING = 'booting'
    STATE_RUNNING = 'running'
    # Boot-up timeout; read from the configuration the first time _is_system_booting needs it.
    boot_timeout = None

    def __init__(self):
        self._checkers = []
        self._state = self.STATE_BOOTING

        from .config import Config
        self.config = Config()
        self.initialize()

    def initialize(self):
        """
        Initialize the manager. Create service checker and hardware checker by default.
        :return:
        """
        from .service_checker import ServiceChecker
        from .hardware_checker import HardwareChecker
        self._checkers.append(ServiceChecker())
        self._checkers.append(HardwareChecker())

    def check(self, chassis):
        """
        Load new configuration if any and perform the system health check for all existing checkers.
        :param chassis: A chassis object.
        :return: A tuple. The first element is the manager state ('booting' or 'running'); the second element is a
                 dictionary that contains the status for all objects that were checked. It is empty while booting.
        """
        from .health_checker import HealthChecker
        # The summary is shared by all checkers, so start every round from OK.
        HealthChecker.summary = HealthChecker.STATUS_OK
        stats = {}
        self.config.load_config()
        # check state first to avoid user change boot timeout in configuration file
        # after finishing system boot
        if self._state == self.STATE_BOOTING and self._is_system_booting():
            self._set_system_led(chassis, self.config, 'booting')
            return self._state, stats

        for checker in self._checkers:
            self._do_check(checker, stats)

        if self.config.user_defined_checkers:
            from .user_defined_checker import UserDefinedChecker
            for udc in self.config.user_defined_checkers:
                checker = UserDefinedChecker(udc)
                self._do_check(checker, stats)

        led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
        self._set_system_led(chassis, self.config, led_status)

        return self._state, stats

    def _do_check(self, checker, stats):
        """
        Do check for a particular checker and collect the check statistic. A checker that raises is reported
        under the 'Internal' category and marks the overall summary as not OK.
        :param checker: A checker object.
        :param stats: Check statistic. Updated in place.
        :return:
        """
        try:
            checker.check(self.config)
            category = checker.get_category()
            info = checker.get_info()
            if category not in stats:
                stats[category] = info
            else:
                stats[category].update(info)
        except Exception as e:
            from .health_checker import HealthChecker
            # A crashed checker is a failure: without this the summary stays OK and check() would set
            # the LED to 'normal' although a 'Not OK' entry is reported below.
            HealthChecker.summary = HealthChecker.STATUS_NOT_OK
            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
            entry = {str(checker): {
                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
            }}
            if 'Internal' not in stats:
                stats['Internal'] = entry
            else:
                stats['Internal'].update(entry)

    def _is_system_booting(self):
        """
        Compare the system uptime with the boot-up timeout and switch the state to running once the
        uptime is no longer below the timeout.
        :return: True if the uptime is still below the boot-up timeout.
        """
        from .utils import get_uptime
        uptime = get_uptime()
        if not self.boot_timeout:
            # Read the timeout only while none is set, so it is not re-read on every call.
            self.boot_timeout = self.config.get_bootup_timeout()
        booting = uptime < self.boot_timeout
        if not booting:
            self._state = self.STATE_RUNNING
        return booting

    def _set_system_led(self, chassis, config, status):
        """
        Set the chassis status LED to the color configured for the given status. Failures are printed, not raised.
        :param chassis: A chassis object providing set_status_led.
        :param config: Health checker configuration providing get_led_color.
        :param status: One of 'booting', 'normal' or 'fault'.
        :return:
        """
        try:
            chassis.set_status_led(config.get_led_color(status))
        except NotImplementedError:
            print('chassis.set_status_led is not implemented')
        except Exception as e:
            print('Failed to set system led due to - {}'.format(repr(e)))
|
||||
|
@ -1,72 +1,72 @@
|
||||
from .health_checker import HealthChecker
|
||||
from . import utils
|
||||
|
||||
|
||||
class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

    # Command to get summary of critical system service.
    CHECK_CMD = 'monit summary -B'
    # Fewest output lines of CHECK_CMD that check() accepts (it reads the header from the second line).
    MIN_CHECK_CMD_LINES = 3

    # Expect status for different system service category.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible',
        'Program': 'Status ok'
    }

    def __init__(self):
        super(ServiceChecker, self).__init__()

    def reset(self):
        self._info = {}

    def get_category(self):
        return 'Services'

    def check(self, config):
        """
        Check critical system service status. The output of $CHECK_CMD is parsed as a fixed-width table: the
        column offsets are taken from the positions of 'Status' and 'Type' in the second output line, and every
        following line is one service. Services whose type is not in EXPECT_STATUS_DICT are skipped.
        :param config: Health checker configuration; its ignore_services lists service names to skip.
        :return:
        """
        self.reset()
        monit_state = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip()
        if monit_state != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        invalid_output_msg = 'output of \"monit summary -B\" is invalid or incompatible'

        summary = utils.run_command(ServiceChecker.CHECK_CMD)
        rows = summary.splitlines()
        if not rows or len(rows) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', invalid_output_msg)
            return

        header = rows[1]
        status_begin = header.find('Status')
        type_begin = header.find('Type')
        if min(status_begin, type_begin) < 0:
            self.set_object_not_ok('Service', 'monit', invalid_output_msg)
            return

        for row in rows[2:]:
            name = row[:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
                continue

            service_type = row[type_begin:].strip()
            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
                continue

            status = row[status_begin:type_begin].strip()
            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
            if status == expect_status:
                self.set_object_ok(service_type, name)
            else:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
|
||||
from .health_checker import HealthChecker
|
||||
from . import utils
|
||||
|
||||
|
||||
class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

    # Command to get summary of critical system service.
    CHECK_CMD = 'monit summary -B'
    # Fewest output lines of CHECK_CMD that check() accepts (it reads the header from the second line).
    MIN_CHECK_CMD_LINES = 3

    # Expect status for different system service category.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible',
        'Program': 'Status ok'
    }

    def __init__(self):
        super(ServiceChecker, self).__init__()

    def reset(self):
        self._info = {}

    def get_category(self):
        return 'Services'

    def check(self, config):
        """
        Check critical system service status. The output of $CHECK_CMD is parsed as a fixed-width table: the
        column offsets are taken from the positions of 'Status' and 'Type' in the second output line, and every
        following line is one service. Services whose type is not in EXPECT_STATUS_DICT are skipped.
        :param config: Health checker configuration; its ignore_services lists service names to skip.
        :return:
        """
        self.reset()
        monit_state = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip()
        if monit_state != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        invalid_output_msg = 'output of \"monit summary -B\" is invalid or incompatible'

        summary = utils.run_command(ServiceChecker.CHECK_CMD)
        rows = summary.splitlines()
        if not rows or len(rows) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', invalid_output_msg)
            return

        header = rows[1]
        status_begin = header.find('Status')
        type_begin = header.find('Type')
        if min(status_begin, type_begin) < 0:
            self.set_object_not_ok('Service', 'monit', invalid_output_msg)
            return

        for row in rows[2:]:
            name = row[:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
                continue

            service_type = row[type_begin:].strip()
            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
                continue

            status = row[status_begin:type_begin].strip()
            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
            if status == expect_status:
                self.set_object_ok(service_type, name)
            else:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
|
||||
|
@ -1,11 +1,11 @@
|
||||
{
|
||||
"services_to_ignore": [],
|
||||
"devices_to_ignore": [],
|
||||
"user_defined_checkers": [],
|
||||
"polling_interval": 60,
|
||||
"led_color": {
|
||||
"fault": "amber",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
}
|
||||
}
|
||||
{
|
||||
"services_to_ignore": [],
|
||||
"devices_to_ignore": [],
|
||||
"user_defined_checkers": [],
|
||||
"polling_interval": 60,
|
||||
"led_color": {
|
||||
"fault": "amber",
|
||||
"normal": "green",
|
||||
"booting": "orange_blink"
|
||||
}
|
||||
}
|
||||
|
@ -1,88 +1,89 @@
|
||||
from .health_checker import HealthChecker
|
||||
from . import utils
|
||||
|
||||
|
||||
class UserDefinedChecker(HealthChecker):
    """
    A user can provide a script or program that performs a customized check for a particular system. To enable a
    user defined checker:
        1. Add an element to "user_defined_checkers" in the configuration file. The element must be a command
           string that can be executed by the shell. For example: "python my_checker.py".
        2. The command output must match the following pattern:
            ${UserDefineCategory}
            ${Object1}:${ObjectStatusMessage1}
            ${Object2}:${ObjectStatusMessage2}

    An example of the command output:
        MyCategory
        Device1:OK
        Device2:OK
        Device3:Out of power
    """

    def __init__(self, cmd):
        """
        Constructor.
        :param cmd: Command string of the user defined checker.
        """
        super(UserDefinedChecker, self).__init__()
        self._cmd = cmd
        self._category = None

    def reset(self):
        # Fall back to the generic category until the command output names its own.
        self._category = 'UserDefine'
        self._info = {}

    def get_category(self):
        return self._category

    def check(self, config):
        """
        Execute the user defined command and parse the output. The first non-blank line becomes the category;
        every later line containing ':' is split at the first ':' into object name and status message. An
        object is OK only when its message is exactly 'OK'. Lines without ':' are skipped.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()

        raw_output = utils.run_command(self._cmd)
        output = raw_output.strip() if raw_output else raw_output
        if not output:
            self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd))
            return

        # Keep only non-blank lines, with surrounding whitespace removed.
        lines = [stripped for stripped in (raw.strip() for raw in output.splitlines()) if stripped]
        if not lines:
            self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd))
            return

        self._category = lines[0]
        for entry in lines[1:]:
            obj_name, separator, msg = entry.partition(':')
            if not separator:
                continue
            obj_name = obj_name.strip()
            msg = msg.strip()
            if msg == 'OK':
                self.set_object_ok('UserDefine', obj_name)
            else:
                self.set_object_not_ok('UserDefine', obj_name, msg)

    def __str__(self):
        return 'UserDefinedChecker - {}'.format(self._cmd)
|
||||
from .health_checker import HealthChecker
|
||||
from . import utils
|
||||
|
||||
|
||||
class UserDefinedChecker(HealthChecker):
    """
    A user can provide a script or program that performs a customized check for a particular system. To enable a
    user defined checker:
        1. Add an element to "user_defined_checkers" in the configuration file. The element must be a command
           string that can be executed by the shell. For example: "python my_checker.py".
        2. The command output must match the following pattern:
            ${UserDefineCategory}
            ${Object1}:${ObjectStatusMessage1}
            ${Object2}:${ObjectStatusMessage2}

    An example of the command output:
        MyCategory
        Device1:OK
        Device2:OK
        Device3:Out of power
    """

    def __init__(self, cmd):
        """
        Constructor.
        :param cmd: Command string of the user defined checker.
        """
        super(UserDefinedChecker, self).__init__()
        self._cmd = cmd
        self._category = None

    def reset(self):
        # Fall back to the generic category until the command output names its own.
        self._category = 'UserDefine'
        self._info = {}

    def get_category(self):
        return self._category

    def check(self, config):
        """
        Execute the user defined command and parse the output. The first non-blank line becomes the category;
        every later line containing ':' is split at the first ':' into object name and status message. An
        object is OK only when its message is exactly 'OK'. Lines without ':' are skipped.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()

        raw_output = utils.run_command(self._cmd)
        output = raw_output.strip() if raw_output else raw_output
        if not output:
            self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd))
            return

        # Keep only non-blank lines, with surrounding whitespace removed.
        lines = [stripped for stripped in (raw.strip() for raw in output.splitlines()) if stripped]
        if not lines:
            self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd))
            return

        self._category = lines[0]
        for entry in lines[1:]:
            obj_name, separator, msg = entry.partition(':')
            if not separator:
                continue
            obj_name = obj_name.strip()
            msg = msg.strip()
            if msg == 'OK':
                self.set_object_ok('UserDefine', obj_name)
            else:
                self.set_object_not_ok('UserDefine', obj_name, msg)

    def __str__(self):
        return 'UserDefinedChecker - {}'.format(self._cmd)
|
||||
|
@ -1,25 +1,25 @@
|
||||
import subprocess
|
||||
|
||||
|
||||
def run_command(command):
    """
    Utility function to run a shell command and return the output.
    :param command: Shell command string.
    :return: Standard output of the shell command as text, or None if the command could not be run or its
             output could not be decoded as UTF-8.
    """
    try:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        raw_output = process.communicate()[0]
        # On Python 3 the pipe yields bytes, which must be decoded (bytes has no encode()); calling
        # encode() here raised AttributeError, so the function always returned None.
        if isinstance(raw_output, bytes):
            return raw_output.decode('utf-8')
        return raw_output
    except Exception:
        return None
|
||||
|
||||
|
||||
def get_uptime():
    """
    Read the system up time from the first field of the first line of /proc/uptime.
    :return: System up time in seconds.
    """
    with open('/proc/uptime', 'r') as uptime_file:
        first_field = uptime_file.readline().split()[0]

    return float(first_field)
|
||||
import subprocess
|
||||
|
||||
|
||||
def run_command(command):
    """
    Utility function to run a shell command and return the output.
    :param command: Shell command string.
    :return: Standard output of the shell command as text, or None if the command could not be run or its
             output could not be decoded as UTF-8.
    """
    try:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        raw_output = process.communicate()[0]
        # On Python 3 the pipe yields bytes, which must be decoded (bytes has no encode()); calling
        # encode() here raised AttributeError, so the function always returned None.
        if isinstance(raw_output, bytes):
            return raw_output.decode('utf-8')
        return raw_output
    except Exception:
        return None
|
||||
|
||||
|
||||
def get_uptime():
    """
    Read the system up time from the first field of the first line of /proc/uptime.
    :return: System up time in seconds.
    """
    with open('/proc/uptime', 'r') as uptime_file:
        first_field = uptime_file.readline().split()[0]

    return float(first_field)
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
healthd
|
||||
|
@ -24,10 +24,10 @@ setup(
|
||||
scripts=[
|
||||
'scripts/healthd',
|
||||
],
|
||||
setup_requires= [
|
||||
setup_requires=[
|
||||
'pytest-runner'
|
||||
],
|
||||
tests_require = [
|
||||
tests_require=[
|
||||
'pytest',
|
||||
'mock>=2.0.0'
|
||||
],
|
||||
@ -40,10 +40,9 @@ setup(
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Natural Language :: English',
|
||||
'Operating System :: POSIX :: Linux',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Topic :: System :: Hardware',
|
||||
],
|
||||
keywords='SONiC sonic HEALTH health',
|
||||
test_suite='setup.get_test_suite'
|
||||
)
|
||||
|
||||
|
@ -22,4 +22,3 @@ class MockConnector(object):
|
||||
|
||||
def get_all(self, db_id, key):
    """
    Return the entry stored under key in the class-level MockConnector.data; db_id is not used.
    """
    entries = MockConnector.data
    return entries[key]
|
||||
|
||||
|
@ -73,8 +73,8 @@ def test_service_checker():
|
||||
'telemetry Does not exist Process\n' \
|
||||
'orchagent Running Process\n' \
|
||||
'root-overlay Accessible Filesystem\n' \
|
||||
'var-log Is not accessible Filesystem\n'
|
||||
|
||||
'var-log Is not accessible Filesystem\n'
|
||||
|
||||
checker = ServiceChecker()
|
||||
config = Config()
|
||||
checker.check(config)
|
||||
|
Reference in New Issue
Block a user