[system-health] Convert to Python 3 (#5886)
- Convert system-health scripts to Python 3
- Build and install system-health as a Python 3 wheel
- Also convert newlines from DOS to UNIX
This commit is contained in:
parent
62662acbd5
commit
566ea4f601
@ -201,12 +201,6 @@ sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install $PLATF
|
|||||||
sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_PDDF_COMMON_PY2_WHEEL_NAME
|
sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_PDDF_COMMON_PY2_WHEEL_NAME
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
# Install system-health Python 2 package
|
|
||||||
SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}})
|
|
||||||
sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
|
|
||||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install $SYSTEM_HEALTH_PY2_WHEEL_NAME
|
|
||||||
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
|
|
||||||
|
|
||||||
# Install sonic-platform-common Python 3 package
|
# Install sonic-platform-common Python 3 package
|
||||||
PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}})
|
PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}})
|
||||||
sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME
|
sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME
|
||||||
@ -219,6 +213,12 @@ sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install thrift
|
|||||||
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install thrift==0.13.0
|
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install thrift==0.13.0
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
# Install system-health Python 3 package
|
||||||
|
SYSTEM_HEALTH_PY3_WHEEL_NAME=$(basename {{system_health_py3_wheel_path}})
|
||||||
|
sudo cp {{system_health_py3_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY3_WHEEL_NAME
|
||||||
|
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install $SYSTEM_HEALTH_PY3_WHEEL_NAME
|
||||||
|
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY3_WHEEL_NAME
|
||||||
|
|
||||||
# Install prerequisites needed for installing the Python m2crypto package, used by sonic-utilities
|
# Install prerequisites needed for installing the Python m2crypto package, used by sonic-utilities
|
||||||
# These packages can be uninstalled after intallation
|
# These packages can be uninstalled after intallation
|
||||||
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install build-essential libssl-dev swig
|
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install build-essential libssl-dev swig
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
# system health python2 wheel
|
# system health Python wheel
|
||||||
|
|
||||||
SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl
|
SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
|
||||||
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
|
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
|
||||||
$(SYSTEM_HEALTH)_PYTHON_VERSION = 2
|
$(SYSTEM_HEALTH)_PYTHON_VERSION = 3
|
||||||
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE)
|
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SWSSSDK_PY3) $(SONIC_CONFIG_ENGINE_PY3)
|
||||||
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
|
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)
|
||||||
|
|
||||||
export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
|
export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
|
||||||
|
@ -1,2 +1,2 @@
|
|||||||
from . import hardware_checker
|
from . import hardware_checker
|
||||||
from . import service_checker
|
from . import service_checker
|
||||||
|
@ -1,144 +1,144 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from sonic_py_common import device_info
|
from sonic_py_common import device_info
|
||||||
|
|
||||||
|
|
||||||
class Config(object):
|
class Config(object):
|
||||||
"""
|
"""
|
||||||
Manage configuration of system health.
|
Manage configuration of system health.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Default system health check interval
|
# Default system health check interval
|
||||||
DEFAULT_INTERVAL = 60
|
DEFAULT_INTERVAL = 60
|
||||||
|
|
||||||
# Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
|
# Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
|
||||||
DEFAULT_BOOTUP_TIMEOUT = 300
|
DEFAULT_BOOTUP_TIMEOUT = 300
|
||||||
|
|
||||||
# Default LED configuration. Different platform has different LED capability. This configuration allow vendor to
|
# Default LED configuration. Different platform has different LED capability. This configuration allow vendor to
|
||||||
# override the default behavior.
|
# override the default behavior.
|
||||||
DEFAULT_LED_CONFIG = {
|
DEFAULT_LED_CONFIG = {
|
||||||
'fault': 'red',
|
'fault': 'red',
|
||||||
'normal': 'green',
|
'normal': 'green',
|
||||||
'booting': 'orange_blink'
|
'booting': 'orange_blink'
|
||||||
}
|
}
|
||||||
|
|
||||||
# System health configuration file name
|
# System health configuration file name
|
||||||
CONFIG_FILE = 'system_health_monitoring_config.json'
|
CONFIG_FILE = 'system_health_monitoring_config.json'
|
||||||
|
|
||||||
# Monit service configuration file path
|
# Monit service configuration file path
|
||||||
MONIT_CONFIG_FILE = '/etc/monit/monitrc'
|
MONIT_CONFIG_FILE = '/etc/monit/monitrc'
|
||||||
|
|
||||||
# Monit service start delay configuration entry
|
# Monit service start delay configuration entry
|
||||||
MONIT_START_DELAY_CONFIG = 'with start delay'
|
MONIT_START_DELAY_CONFIG = 'with start delay'
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""
|
"""
|
||||||
Constructor. Initialize all configuration entry to default value in case there is no configuration file.
|
Constructor. Initialize all configuration entry to default value in case there is no configuration file.
|
||||||
"""
|
"""
|
||||||
self.platform_name = device_info.get_platform()
|
self.platform_name = device_info.get_platform()
|
||||||
self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
|
self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
|
||||||
self._last_mtime = None
|
self._last_mtime = None
|
||||||
self.config_data = None
|
self.config_data = None
|
||||||
self.interval = Config.DEFAULT_INTERVAL
|
self.interval = Config.DEFAULT_INTERVAL
|
||||||
self.ignore_services = None
|
self.ignore_services = None
|
||||||
self.ignore_devices = None
|
self.ignore_devices = None
|
||||||
self.user_defined_checkers = None
|
self.user_defined_checkers = None
|
||||||
|
|
||||||
def config_file_exists(self):
|
def config_file_exists(self):
|
||||||
return os.path.exists(self._config_file)
|
return os.path.exists(self._config_file)
|
||||||
|
|
||||||
def load_config(self):
|
def load_config(self):
|
||||||
"""
|
"""
|
||||||
Load the configuration file from disk.
|
Load the configuration file from disk.
|
||||||
1. If there is no configuration file, current config entries will reset to default value
|
1. If there is no configuration file, current config entries will reset to default value
|
||||||
2. Only read the configuration file is last_mtime changes for better performance
|
2. Only read the configuration file is last_mtime changes for better performance
|
||||||
3. If there is any format issues in configuration file, current config entries will reset to default value
|
3. If there is any format issues in configuration file, current config entries will reset to default value
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
if not self.config_file_exists():
|
if not self.config_file_exists():
|
||||||
if self._last_mtime is not None:
|
if self._last_mtime is not None:
|
||||||
self._reset()
|
self._reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
mtime = os.stat(self._config_file)
|
mtime = os.stat(self._config_file)
|
||||||
if mtime != self._last_mtime:
|
if mtime != self._last_mtime:
|
||||||
try:
|
try:
|
||||||
self._last_mtime = mtime
|
self._last_mtime = mtime
|
||||||
with open(self._config_file, 'r') as f:
|
with open(self._config_file, 'r') as f:
|
||||||
self.config_data = json.load(f)
|
self.config_data = json.load(f)
|
||||||
|
|
||||||
self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
|
self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
|
||||||
self.ignore_services = self._get_list_data('services_to_ignore')
|
self.ignore_services = self._get_list_data('services_to_ignore')
|
||||||
self.ignore_devices = self._get_list_data('devices_to_ignore')
|
self.ignore_devices = self._get_list_data('devices_to_ignore')
|
||||||
self.user_defined_checkers = self._get_list_data('user_defined_checkers')
|
self.user_defined_checkers = self._get_list_data('user_defined_checkers')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._reset()
|
self._reset()
|
||||||
|
|
||||||
def _reset(self):
|
def _reset(self):
|
||||||
"""
|
"""
|
||||||
Reset current configuration entry to default value
|
Reset current configuration entry to default value
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
self._last_mtime = None
|
self._last_mtime = None
|
||||||
self.config_data = None
|
self.config_data = None
|
||||||
self.interval = Config.DEFAULT_INTERVAL
|
self.interval = Config.DEFAULT_INTERVAL
|
||||||
self.ignore_services = None
|
self.ignore_services = None
|
||||||
self.ignore_devices = None
|
self.ignore_devices = None
|
||||||
self.user_defined_checkers = None
|
self.user_defined_checkers = None
|
||||||
|
|
||||||
def get_led_color(self, status):
|
def get_led_color(self, status):
|
||||||
"""
|
"""
|
||||||
Get desired LED color according to the input status
|
Get desired LED color according to the input status
|
||||||
:param status: System health status
|
:param status: System health status
|
||||||
:return: StringLED color
|
:return: StringLED color
|
||||||
"""
|
"""
|
||||||
if self.config_data and 'led_color' in self.config_data:
|
if self.config_data and 'led_color' in self.config_data:
|
||||||
if status in self.config_data['led_color']:
|
if status in self.config_data['led_color']:
|
||||||
return self.config_data['led_color'][status]
|
return self.config_data['led_color'][status]
|
||||||
|
|
||||||
return self.DEFAULT_LED_CONFIG[status]
|
return self.DEFAULT_LED_CONFIG[status]
|
||||||
|
|
||||||
def get_bootup_timeout(self):
|
def get_bootup_timeout(self):
|
||||||
"""
|
"""
|
||||||
Get boot up timeout from monit configuration file.
|
Get boot up timeout from monit configuration file.
|
||||||
1. If monit configuration file does not exist, return default value
|
1. If monit configuration file does not exist, return default value
|
||||||
2. If there is any exception while parsing monit config, return default value
|
2. If there is any exception while parsing monit config, return default value
|
||||||
:return: Integer timeout value
|
:return: Integer timeout value
|
||||||
"""
|
"""
|
||||||
if not os.path.exists(Config.MONIT_CONFIG_FILE):
|
if not os.path.exists(Config.MONIT_CONFIG_FILE):
|
||||||
return self.DEFAULT_BOOTUP_TIMEOUT
|
return self.DEFAULT_BOOTUP_TIMEOUT
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(Config.MONIT_CONFIG_FILE) as f:
|
with open(Config.MONIT_CONFIG_FILE) as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
pos = line.find('#')
|
pos = line.find('#')
|
||||||
if pos == 0:
|
if pos == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
line = line[:pos]
|
line = line[:pos]
|
||||||
pos = line.find(Config.MONIT_START_DELAY_CONFIG)
|
pos = line.find(Config.MONIT_START_DELAY_CONFIG)
|
||||||
if pos != -1:
|
if pos != -1:
|
||||||
return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
|
return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
|
||||||
except Exception:
|
except Exception:
|
||||||
return self.DEFAULT_BOOTUP_TIMEOUT
|
return self.DEFAULT_BOOTUP_TIMEOUT
|
||||||
|
|
||||||
def _get_list_data(self, key):
|
def _get_list_data(self, key):
|
||||||
"""
|
"""
|
||||||
Get list type configuration data by key and remove duplicate element.
|
Get list type configuration data by key and remove duplicate element.
|
||||||
:param key: Key of the configuration entry
|
:param key: Key of the configuration entry
|
||||||
:return: A set of configuration data if key exists
|
:return: A set of configuration data if key exists
|
||||||
"""
|
"""
|
||||||
if key in self.config_data:
|
if key in self.config_data:
|
||||||
data = self.config_data[key]
|
data = self.config_data[key]
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
return set(data)
|
return set(data)
|
||||||
return None
|
return None
|
||||||
|
@ -1,248 +1,248 @@
|
|||||||
from natsort import natsorted
|
from natsort import natsorted
|
||||||
from swsssdk import SonicV2Connector
|
from swsssdk import SonicV2Connector
|
||||||
|
|
||||||
from .health_checker import HealthChecker
|
from .health_checker import HealthChecker
|
||||||
|
|
||||||
|
|
||||||
class HardwareChecker(HealthChecker):
|
class HardwareChecker(HealthChecker):
|
||||||
"""
|
"""
|
||||||
Check system hardware status. For now, it checks ASIC, PSU and fan status.
|
Check system hardware status. For now, it checks ASIC, PSU and fan status.
|
||||||
"""
|
"""
|
||||||
ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
|
ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
|
||||||
FAN_TABLE_NAME = 'FAN_INFO'
|
FAN_TABLE_NAME = 'FAN_INFO'
|
||||||
PSU_TABLE_NAME = 'PSU_INFO'
|
PSU_TABLE_NAME = 'PSU_INFO'
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HealthChecker.__init__(self)
|
HealthChecker.__init__(self)
|
||||||
self._db = SonicV2Connector(host="127.0.0.1")
|
self._db = SonicV2Connector(host="127.0.0.1")
|
||||||
self._db.connect(self._db.STATE_DB)
|
self._db.connect(self._db.STATE_DB)
|
||||||
|
|
||||||
def get_category(self):
|
def get_category(self):
|
||||||
return 'Hardware'
|
return 'Hardware'
|
||||||
|
|
||||||
def check(self, config):
|
def check(self, config):
|
||||||
self.reset()
|
self.reset()
|
||||||
self._check_asic_status(config)
|
self._check_asic_status(config)
|
||||||
self._check_fan_status(config)
|
self._check_fan_status(config)
|
||||||
self._check_psu_status(config)
|
self._check_psu_status(config)
|
||||||
|
|
||||||
def _check_asic_status(self, config):
|
def _check_asic_status(self, config):
|
||||||
"""
|
"""
|
||||||
Check if ASIC temperature is in valid range.
|
Check if ASIC temperature is in valid range.
|
||||||
:param config: Health checker configuration
|
:param config: Health checker configuration
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
if config.ignore_devices and 'asic' in config.ignore_devices:
|
if config.ignore_devices and 'asic' in config.ignore_devices:
|
||||||
return
|
return
|
||||||
|
|
||||||
temperature = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature')
|
temperature = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature')
|
||||||
temperature_threshold = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold')
|
temperature_threshold = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold')
|
||||||
if not temperature:
|
if not temperature:
|
||||||
self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
|
self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
|
||||||
elif not temperature_threshold:
|
elif not temperature_threshold:
|
||||||
self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
|
self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
temperature = float(temperature)
|
temperature = float(temperature)
|
||||||
temperature_threshold = float(temperature_threshold)
|
temperature_threshold = float(temperature_threshold)
|
||||||
if temperature > temperature_threshold:
|
if temperature > temperature_threshold:
|
||||||
self.set_object_not_ok('ASIC', 'ASIC',
|
self.set_object_not_ok('ASIC', 'ASIC',
|
||||||
'ASIC temperature is too hot, temperature={}, threshold={}'.format(
|
'ASIC temperature is too hot, temperature={}, threshold={}'.format(
|
||||||
temperature,
|
temperature,
|
||||||
temperature_threshold))
|
temperature_threshold))
|
||||||
else:
|
else:
|
||||||
self.set_object_ok('ASIC', 'ASIC')
|
self.set_object_ok('ASIC', 'ASIC')
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
self.set_object_not_ok('ASIC', 'ASIC',
|
self.set_object_not_ok('ASIC', 'ASIC',
|
||||||
'Invalid ASIC temperature data, temperature={}, threshold={}'.format(temperature,
|
'Invalid ASIC temperature data, temperature={}, threshold={}'.format(temperature,
|
||||||
temperature_threshold))
|
temperature_threshold))
|
||||||
|
|
||||||
def _check_fan_status(self, config):
|
def _check_fan_status(self, config):
|
||||||
"""
|
"""
|
||||||
Check fan status including:
|
Check fan status including:
|
||||||
1. Check all fans are present
|
1. Check all fans are present
|
||||||
2. Check all fans are in good state
|
2. Check all fans are in good state
|
||||||
3. Check fan speed is in valid range
|
3. Check fan speed is in valid range
|
||||||
:param config: Health checker configuration
|
:param config: Health checker configuration
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
if config.ignore_devices and 'fan' in config.ignore_devices:
|
if config.ignore_devices and 'fan' in config.ignore_devices:
|
||||||
return
|
return
|
||||||
|
|
||||||
keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
|
keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
|
||||||
if not keys:
|
if not keys:
|
||||||
self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
|
self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
|
||||||
return
|
return
|
||||||
|
|
||||||
for key in natsorted(keys):
|
for key in natsorted(keys):
|
||||||
key_list = key.split('|')
|
key_list = key.split('|')
|
||||||
if len(key_list) != 2: # error data in DB, log it and ignore
|
if len(key_list) != 2: # error data in DB, log it and ignore
|
||||||
self.set_object_not_ok('Fan', key, 'Invalid key for FAN_INFO: {}'.format(key))
|
self.set_object_not_ok('Fan', key, 'Invalid key for FAN_INFO: {}'.format(key))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
name = key_list[1]
|
name = key_list[1]
|
||||||
if config.ignore_devices and name in config.ignore_devices:
|
if config.ignore_devices and name in config.ignore_devices:
|
||||||
continue
|
continue
|
||||||
data_dict = self._db.get_all(self._db.STATE_DB, key)
|
data_dict = self._db.get_all(self._db.STATE_DB, key)
|
||||||
presence = data_dict.get('presence', 'false')
|
presence = data_dict.get('presence', 'false')
|
||||||
if presence.lower() != 'true':
|
if presence.lower() != 'true':
|
||||||
self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
|
self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
status = data_dict.get('status', 'false')
|
status = data_dict.get('status', 'false')
|
||||||
if status.lower() != 'true':
|
if status.lower() != 'true':
|
||||||
self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
|
self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
|
if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
|
||||||
speed = data_dict.get('speed', None)
|
speed = data_dict.get('speed', None)
|
||||||
speed_target = data_dict.get('speed_target', None)
|
speed_target = data_dict.get('speed_target', None)
|
||||||
speed_tolerance = data_dict.get('speed_tolerance', None)
|
speed_tolerance = data_dict.get('speed_tolerance', None)
|
||||||
if not speed:
|
if not speed:
|
||||||
self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
|
self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
elif not speed_target:
|
elif not speed_target:
|
||||||
self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
|
self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
elif not speed_tolerance:
|
elif not speed_tolerance:
|
||||||
self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
|
self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
speed = float(speed)
|
speed = float(speed)
|
||||||
speed_target = float(speed_target)
|
speed_target = float(speed_target)
|
||||||
speed_tolerance = float(speed_tolerance)
|
speed_tolerance = float(speed_tolerance)
|
||||||
speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
|
speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
|
||||||
speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
|
speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
|
||||||
if speed < speed_min_th or speed > speed_max_th:
|
if speed < speed_min_th or speed > speed_max_th:
|
||||||
self.set_object_not_ok('Fan', name,
|
self.set_object_not_ok('Fan', name,
|
||||||
'{} speed is out of range, speed={}, range=[{},{}]'.format(name,
|
'{} speed is out of range, speed={}, range=[{},{}]'.format(name,
|
||||||
speed,
|
speed,
|
||||||
speed_min_th,
|
speed_min_th,
|
||||||
speed_max_th))
|
speed_max_th))
|
||||||
continue
|
continue
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.set_object_not_ok('Fan', name,
|
self.set_object_not_ok('Fan', name,
|
||||||
'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
|
'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
|
||||||
name,
|
name,
|
||||||
speed,
|
speed,
|
||||||
speed_target,
|
speed_target,
|
||||||
speed_tolerance))
|
speed_tolerance))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.set_object_ok('Fan', name)
|
self.set_object_ok('Fan', name)
|
||||||
|
|
||||||
def _check_psu_status(self, config):
|
def _check_psu_status(self, config):
|
||||||
"""
|
"""
|
||||||
Check PSU status including:
|
Check PSU status including:
|
||||||
1. Check all PSUs are present
|
1. Check all PSUs are present
|
||||||
2. Check all PSUs are power on
|
2. Check all PSUs are power on
|
||||||
3. Check PSU temperature is in valid range
|
3. Check PSU temperature is in valid range
|
||||||
4. Check PSU voltage is in valid range
|
4. Check PSU voltage is in valid range
|
||||||
:param config: Health checker configuration
|
:param config: Health checker configuration
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
if config.ignore_devices and 'psu' in config.ignore_devices:
|
if config.ignore_devices and 'psu' in config.ignore_devices:
|
||||||
return
|
return
|
||||||
|
|
||||||
keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
|
keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
|
||||||
if not keys:
|
if not keys:
|
||||||
self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
|
self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
|
||||||
return
|
return
|
||||||
|
|
||||||
for key in natsorted(keys):
|
for key in natsorted(keys):
|
||||||
key_list = key.split('|')
|
key_list = key.split('|')
|
||||||
if len(key_list) != 2: # error data in DB, log it and ignore
|
if len(key_list) != 2: # error data in DB, log it and ignore
|
||||||
self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key))
|
self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
name = key_list[1]
|
name = key_list[1]
|
||||||
if config.ignore_devices and name in config.ignore_devices:
|
if config.ignore_devices and name in config.ignore_devices:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
data_dict = self._db.get_all(self._db.STATE_DB, key)
|
data_dict = self._db.get_all(self._db.STATE_DB, key)
|
||||||
presence = data_dict.get('presence', 'false')
|
presence = data_dict.get('presence', 'false')
|
||||||
if presence.lower() != 'true':
|
if presence.lower() != 'true':
|
||||||
self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
|
self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
status = data_dict.get('status', 'false')
|
status = data_dict.get('status', 'false')
|
||||||
if status.lower() != 'true':
|
if status.lower() != 'true':
|
||||||
self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
|
self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'):
|
if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'):
|
||||||
temperature = data_dict.get('temp', None)
|
temperature = data_dict.get('temp', None)
|
||||||
temperature_threshold = data_dict.get('temp_threshold', None)
|
temperature_threshold = data_dict.get('temp_threshold', None)
|
||||||
if temperature is None:
|
if temperature is None:
|
||||||
self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
|
self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
elif temperature_threshold is None:
|
elif temperature_threshold is None:
|
||||||
self.set_object_not_ok('PSU', name, 'Failed to get temperature threshold data for {}'.format(name))
|
self.set_object_not_ok('PSU', name, 'Failed to get temperature threshold data for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
temperature = float(temperature)
|
temperature = float(temperature)
|
||||||
temperature_threshold = float(temperature_threshold)
|
temperature_threshold = float(temperature_threshold)
|
||||||
if temperature > temperature_threshold:
|
if temperature > temperature_threshold:
|
||||||
self.set_object_not_ok('PSU', name,
|
self.set_object_not_ok('PSU', name,
|
||||||
'{} temperature is too hot, temperature={}, threshold={}'.format(
|
'{} temperature is too hot, temperature={}, threshold={}'.format(
|
||||||
name, temperature,
|
name, temperature,
|
||||||
temperature_threshold))
|
temperature_threshold))
|
||||||
continue
|
continue
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.set_object_not_ok('PSU', name,
|
self.set_object_not_ok('PSU', name,
|
||||||
'Invalid temperature data for {}, temperature={}, threshold={}'.format(
|
'Invalid temperature data for {}, temperature={}, threshold={}'.format(
|
||||||
name, temperature,
|
name, temperature,
|
||||||
temperature_threshold))
|
temperature_threshold))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'):
|
if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'):
|
||||||
voltage = data_dict.get('voltage', None)
|
voltage = data_dict.get('voltage', None)
|
||||||
voltage_min_th = data_dict.get('voltage_min_threshold', None)
|
voltage_min_th = data_dict.get('voltage_min_threshold', None)
|
||||||
voltage_max_th = data_dict.get('voltage_max_threshold', None)
|
voltage_max_th = data_dict.get('voltage_max_threshold', None)
|
||||||
if voltage is None:
|
if voltage is None:
|
||||||
self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
|
self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
elif voltage_min_th is None:
|
elif voltage_min_th is None:
|
||||||
self.set_object_not_ok('PSU', name,
|
self.set_object_not_ok('PSU', name,
|
||||||
'Failed to get voltage minimum threshold data for {}'.format(name))
|
'Failed to get voltage minimum threshold data for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
elif voltage_max_th is None:
|
elif voltage_max_th is None:
|
||||||
self.set_object_not_ok('PSU', name,
|
self.set_object_not_ok('PSU', name,
|
||||||
'Failed to get voltage maximum threshold data for {}'.format(name))
|
'Failed to get voltage maximum threshold data for {}'.format(name))
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
voltage = float(voltage)
|
voltage = float(voltage)
|
||||||
voltage_min_th = float(voltage_min_th)
|
voltage_min_th = float(voltage_min_th)
|
||||||
voltage_max_th = float(voltage_max_th)
|
voltage_max_th = float(voltage_max_th)
|
||||||
if voltage < voltage_min_th or voltage > voltage_max_th:
|
if voltage < voltage_min_th or voltage > voltage_max_th:
|
||||||
self.set_object_not_ok('PSU', name,
|
self.set_object_not_ok('PSU', name,
|
||||||
'{} voltage is out of range, voltage={}, range=[{},{}]'.format(name,
|
'{} voltage is out of range, voltage={}, range=[{},{}]'.format(name,
|
||||||
voltage,
|
voltage,
|
||||||
voltage_min_th,
|
voltage_min_th,
|
||||||
voltage_max_th))
|
voltage_max_th))
|
||||||
continue
|
continue
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.set_object_not_ok('PSU', name,
|
self.set_object_not_ok('PSU', name,
|
||||||
'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(name,
|
'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(name,
|
||||||
voltage,
|
voltage,
|
||||||
voltage_min_th,
|
voltage_min_th,
|
||||||
voltage_max_th))
|
voltage_max_th))
|
||||||
continue
|
continue
|
||||||
self.set_object_ok('PSU', name)
|
self.set_object_ok('PSU', name)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self._info = {}
|
self._info = {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _ignore_check(cls, ignore_set, category, object_name, check_point):
|
def _ignore_check(cls, ignore_set, category, object_name, check_point):
|
||||||
if not ignore_set:
|
if not ignore_set:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if '{}.{}'.format(category, check_point) in ignore_set:
|
if '{}.{}'.format(category, check_point) in ignore_set:
|
||||||
return True
|
return True
|
||||||
elif '{}.{}'.format(object_name, check_point) in ignore_set:
|
elif '{}.{}'.format(object_name, check_point) in ignore_set:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
@ -1,86 +1,86 @@
|
|||||||
class HealthChecker(object):
    """
    Base class for health checker. A checker is an object that performs system health check for a particular category,
    it collects and stores information after the check.
    """
    # Keys used inside each per-object result dictionary stored in self._info.
    INFO_FIELD_OBJECT_TYPE = 'type'
    INFO_FIELD_OBJECT_STATUS = 'status'
    INFO_FIELD_OBJECT_MSG = 'message'

    # Values stored under INFO_FIELD_OBJECT_STATUS.
    STATUS_OK = 'OK'
    STATUS_NOT_OK = 'Not OK'

    # Class-level summary shared by all checkers; set_object_not_ok() switches
    # it to STATUS_NOT_OK.
    summary = STATUS_OK

    def __init__(self):
        # Maps object name -> {attribute name: attribute value}.
        self._info = {}

    def reset(self):
        """
        Reset the status of the checker. Called every time before the check.
        :return:
        """
        pass

    def get_category(self):
        """
        Get category of the checker.
        :return: String category
        """
        pass

    def get_info(self):
        """
        Get information of the checker. A checker usually checks a few objects and each object status will be put to
        self._info.
        :return: Check result.
        """
        return self._info

    def check(self, config):
        """
        Perform the check.
        :param config: Health checker configuration.
        :return:
        """
        pass

    def __str__(self):
        return self.__class__.__name__

    def add_info(self, object_name, key, value):
        """
        Add check result for an object.
        :param object_name: Object name.
        :param key: Object attribute name.
        :param value: Object attribute value.
        :return:
        """
        # Create the per-object dictionary on first use, then store the value.
        self._info.setdefault(object_name, {})[key] = value

    def set_object_not_ok(self, object_type, object_name, message):
        """
        Set that an object is not OK.
        :param object_type: Object type.
        :param object_name: Object name.
        :param message: A message to describe what is wrong with the object.
        :return:
        """
        self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type)
        self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, message)
        self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK)
        # Written on the base class explicitly so every checker sees the same summary.
        HealthChecker.summary = HealthChecker.STATUS_NOT_OK

    def set_object_ok(self, object_type, object_name):
        """
        Set that an object is in good state.
        :param object_type: Object type.
        :param object_name: Object name.
        :return:
        """
        self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type)
        self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, '')
        self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK)
||||||
|
@ -1,101 +1,101 @@
|
|||||||
class HealthCheckerManager(object):
    """
    Manage all system health checkers and system health configuration.
    """
    STATE_BOOTING = 'booting'
    STATE_RUNNING = 'running'
    # Boot-up timeout; read from the configuration the first time it is needed.
    boot_timeout = None

    def __init__(self):
        self._checkers = []
        self._state = self.STATE_BOOTING

        from .config import Config
        self.config = Config()
        self.initialize()

    def initialize(self):
        """
        Initialize the manager. Create service checker and hardware checker by default.
        :return:
        """
        from .service_checker import ServiceChecker
        from .hardware_checker import HardwareChecker
        self._checkers.append(ServiceChecker())
        self._checkers.append(HardwareChecker())

    def check(self, chassis):
        """
        Load new configuration if any and perform the system health check for all existing checkers.
        :param chassis: A chassis object.
        :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
        contains the status for all objects that was checked.
        """
        from .health_checker import HealthChecker
        HealthChecker.summary = HealthChecker.STATUS_OK
        stats = {}
        self.config.load_config()
        # check state first to avoid user change boot timeout in configuration file
        # after finishing system boot
        if self._state == self.STATE_BOOTING and self._is_system_booting():
            self._set_system_led(chassis, self.config, 'booting')
            return self._state, stats

        for checker in self._checkers:
            self._do_check(checker, stats)

        if self.config.user_defined_checkers:
            from .user_defined_checker import UserDefinedChecker
            for udc in self.config.user_defined_checkers:
                checker = UserDefinedChecker(udc)
                self._do_check(checker, stats)

        led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
        self._set_system_led(chassis, self.config, led_status)

        return self._state, stats

    def _do_check(self, checker, stats):
        """
        Do check for a particular checker and collect the check statistic.
        :param checker: A checker object.
        :param stats: Check statistic.
        :return:
        """
        try:
            checker.check(self.config)
            category = checker.get_category()
            info = checker.get_info()
            if category not in stats:
                stats[category] = info
            else:
                stats[category].update(info)
        except Exception as e:
            # A failing checker must not abort the whole round; record the
            # failure under the 'Internal' category instead.
            from .health_checker import HealthChecker
            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
            entry = {str(checker): {
                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
            }}
            if 'Internal' not in stats:
                stats['Internal'] = entry
            else:
                stats['Internal'].update(entry)

    def _is_system_booting(self):
        """
        Compare the system uptime with the boot-up timeout and update the manager state.
        :return: True if the uptime is still below the boot-up timeout, False otherwise.
        """
        from .utils import get_uptime
        uptime = get_uptime()
        if not self.boot_timeout:
            self.boot_timeout = self.config.get_bootup_timeout()
        booting = uptime < self.boot_timeout
        if not booting:
            self._state = self.STATE_RUNNING
        return booting

    def _set_system_led(self, chassis, config, status):
        """
        Set the chassis status LED to the color configured for a status.
        Failures are printed and never propagated to the caller.
        :param chassis: A chassis object providing set_status_led.
        :param config: Configuration object providing get_led_color.
        :param status: Status name passed to config.get_led_color.
        :return:
        """
        try:
            chassis.set_status_led(config.get_led_color(status))
        except NotImplementedError:
            print('chassis.set_status_led is not implemented')
        except Exception as e:
            print('Failed to set system led due to - {}'.format(repr(e)))
||||||
|
@ -1,72 +1,72 @@
|
|||||||
from .health_checker import HealthChecker
|
from .health_checker import HealthChecker
|
||||||
from . import utils
|
from . import utils
|
||||||
|
|
||||||
|
|
||||||
class ServiceChecker(HealthChecker):
    """
    Checker that checks critical system service status via monit service.
    """

    # Command to query the status of monit service.
    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

    # Command to get summary of critical system service.
    CHECK_CMD = 'monit summary -B'
    MIN_CHECK_CMD_LINES = 3

    # Expect status for different system service category.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible',
        'Program': 'Status ok'
    }

    def __init__(self):
        HealthChecker.__init__(self)

    def reset(self):
        self._info = {}

    def get_category(self):
        return 'Services'

    def check(self, config):
        """
        Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
        process and file system.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()
        # utils.run_command returns None when the command could not be run;
        # treat that like empty output instead of failing on None.strip().
        output = (utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD) or '').strip()
        if output != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        output = utils.run_command(ServiceChecker.CHECK_CMD) or ''
        lines = output.splitlines()
        if len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        # The second line is the column header; the offsets of its 'Status'
        # and 'Type' titles are used to slice every following row.
        status_begin = lines[1].find('Status')
        type_begin = lines[1].find('Type')
        if status_begin < 0 or type_begin < 0:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        for line in lines[2:]:
            name = line[0:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
                continue
            status = line[status_begin:type_begin].strip()
            service_type = line[type_begin:].strip()
            # Rows whose type has no expected status are not evaluated.
            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
                continue
            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
            if expect_status != status:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
            else:
                self.set_object_ok(service_type, name)
||||||
|
@ -1,11 +1,11 @@
|
|||||||
{
|
{
|
||||||
"services_to_ignore": [],
|
"services_to_ignore": [],
|
||||||
"devices_to_ignore": [],
|
"devices_to_ignore": [],
|
||||||
"user_defined_checkers": [],
|
"user_defined_checkers": [],
|
||||||
"polling_interval": 60,
|
"polling_interval": 60,
|
||||||
"led_color": {
|
"led_color": {
|
||||||
"fault": "amber",
|
"fault": "amber",
|
||||||
"normal": "green",
|
"normal": "green",
|
||||||
"booting": "orange_blink"
|
"booting": "orange_blink"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,88 +1,89 @@
|
|||||||
from .health_checker import HealthChecker
|
from .health_checker import HealthChecker
|
||||||
from . import utils
|
from . import utils
|
||||||
|
|
||||||
|
|
||||||
class UserDefinedChecker(HealthChecker):
    """
    User could implement a script or program to perform customize check for particular system. In order to enable a
    user defined checker:
        1. Add an element to "user_defined_checkers" in the configuration file. The element must be an command string
        that can be executed by shell. For example: "python my_checker.py".
        2. The command output must match the following pattern:
            ${UserDefineCategory}
            ${Object1}:${ObjectStatusMessage1}
            ${Object2}:${ObjectStatusMessage2}

    An example of the command output:
        MyCategory
        Device1:OK
        Device2:OK
        Device3:Out of power
    """

    def __init__(self, cmd):
        """
        Constructor.
        :param cmd: Command string of the user defined checker.
        """
        HealthChecker.__init__(self)
        self._cmd = cmd
        self._category = None

    def reset(self):
        # Default category, used until the command output supplies its own.
        self._category = 'UserDefine'
        self._info = {}

    def get_category(self):
        return self._category

    def check(self, config):
        """
        Execute the user defined command and parse the output.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()

        output = utils.run_command(self._cmd)
        # Covers None, an empty string and whitespace-only output.
        if not output or not output.strip():
            self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd))
            return

        # Keep only non-blank lines, stripped of surrounding whitespace.
        lines = [line.strip() for line in output.strip().splitlines() if line.strip()]
        if not lines:
            self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd))
            return

        # The first line names the category; every following line is expected
        # to be "<object>:<message>". Lines without a colon are skipped.
        self._category = lines[0]
        for line in lines[1:]:
            obj_name, separator, msg = line.partition(':')
            if not separator:
                continue
            obj_name = obj_name.strip()
            msg = msg.strip()
            if msg != 'OK':
                self.set_object_not_ok('UserDefine', obj_name, msg)
            else:
                self.set_object_ok('UserDefine', obj_name)

    def __str__(self):
        return 'UserDefinedChecker - {}'.format(self._cmd)
||||||
|
@ -1,25 +1,25 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
def run_command(command):
    """
    Utility function to run an shell command and return the output.
    :param command: Shell command string.
    :return: Output of the shell command as text, or None if the command could not be run.
    """
    try:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        # communicate() yields bytes because the pipe is opened in binary
        # mode; bytes has no encode(), so the data must be decoded to text.
        # Undecodable bytes are replaced rather than discarding the output.
        return process.communicate()[0].decode('utf-8', errors='replace')
    except Exception:
        # Best effort: callers treat None as "no output available".
        return None
||||||
|
|
||||||
|
|
||||||
def get_uptime():
    """
    Utility to get the system up time.
    :return: System up time in seconds.
    """
    # The first whitespace-separated field of /proc/uptime is parsed as the uptime.
    with open('/proc/uptime', 'r') as f:
        uptime_seconds = float(f.readline().split()[0])

    return uptime_seconds
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
"""
|
"""
|
||||||
healthd
|
healthd
|
||||||
|
@ -24,10 +24,10 @@ setup(
|
|||||||
scripts=[
|
scripts=[
|
||||||
'scripts/healthd',
|
'scripts/healthd',
|
||||||
],
|
],
|
||||||
setup_requires= [
|
setup_requires=[
|
||||||
'pytest-runner'
|
'pytest-runner'
|
||||||
],
|
],
|
||||||
tests_require = [
|
tests_require=[
|
||||||
'pytest',
|
'pytest',
|
||||||
'mock>=2.0.0'
|
'mock>=2.0.0'
|
||||||
],
|
],
|
||||||
@ -40,10 +40,9 @@ setup(
|
|||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
'Natural Language :: English',
|
'Natural Language :: English',
|
||||||
'Operating System :: POSIX :: Linux',
|
'Operating System :: POSIX :: Linux',
|
||||||
'Programming Language :: Python :: 2.7',
|
'Programming Language :: Python :: 3.7',
|
||||||
'Topic :: System :: Hardware',
|
'Topic :: System :: Hardware',
|
||||||
],
|
],
|
||||||
keywords='SONiC sonic HEALTH health',
|
keywords='SONiC sonic HEALTH health',
|
||||||
test_suite='setup.get_test_suite'
|
test_suite='setup.get_test_suite'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -22,4 +22,3 @@ class MockConnector(object):
|
|||||||
|
|
||||||
def get_all(self, db_id, key):
    """
    Return the canned entry stored for key in MockConnector.data.
    :param db_id: Database identifier; not used by this mock.
    :param key: Key to look up in MockConnector.data.
    :return: The value stored under key; raises KeyError if the key is absent.
    """
    return MockConnector.data[key]
||||||
|
|
||||||
|
@ -73,8 +73,8 @@ def test_service_checker():
|
|||||||
'telemetry Does not exist Process\n' \
|
'telemetry Does not exist Process\n' \
|
||||||
'orchagent Running Process\n' \
|
'orchagent Running Process\n' \
|
||||||
'root-overlay Accessible Filesystem\n' \
|
'root-overlay Accessible Filesystem\n' \
|
||||||
'var-log Is not accessible Filesystem\n'
|
'var-log Is not accessible Filesystem\n'
|
||||||
|
|
||||||
checker = ServiceChecker()
|
checker = ServiceChecker()
|
||||||
config = Config()
|
config = Config()
|
||||||
checker.check(config)
|
checker.check(config)
|
||||||
|
Reference in New Issue
Block a user