Make system health service start early (#9792)
- Why I did it: For the SYSTEM READY feature. Currently, the system health service has a booting stage to indicate that the system is still loading SONiC components. This booting stage is no longer needed because the SYSTEM READY feature will treat that stage as system "NOT READY". - How I did it: 1. Removed the booting stage. 2. Adjusted the unit test cases. - How to verify it: manual test, unit test, sonic-mgmt regression.
This commit is contained in:
parent
43e967d6a4
commit
c06cb219e2
@ -1,4 +1,3 @@
|
|||||||
from . import utils
|
|
||||||
from .config import Config
|
from .config import Config
|
||||||
from .health_checker import HealthChecker
|
from .health_checker import HealthChecker
|
||||||
from .service_checker import ServiceChecker
|
from .service_checker import ServiceChecker
|
||||||
@ -10,14 +9,10 @@ class HealthCheckerManager(object):
|
|||||||
"""
|
"""
|
||||||
Manage all system health checkers and system health configuration.
|
Manage all system health checkers and system health configuration.
|
||||||
"""
|
"""
|
||||||
STATE_BOOTING = 'booting'
|
|
||||||
STATE_RUNNING = 'running'
|
|
||||||
boot_timeout = None
|
boot_timeout = None
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._checkers = []
|
self._checkers = []
|
||||||
self._state = self.STATE_BOOTING
|
|
||||||
|
|
||||||
self.config = Config()
|
self.config = Config()
|
||||||
self.initialize()
|
self.initialize()
|
||||||
|
|
||||||
@ -33,17 +28,11 @@ class HealthCheckerManager(object):
|
|||||||
"""
|
"""
|
||||||
Load new configuration if any and perform the system health check for all existing checkers.
|
Load new configuration if any and perform the system health check for all existing checkers.
|
||||||
:param chassis: A chassis object.
|
:param chassis: A chassis object.
|
||||||
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
|
:return: A dictionary that contains the status for all objects that was checked.
|
||||||
contains the status for all objects that was checked.
|
|
||||||
"""
|
"""
|
||||||
HealthChecker.summary = HealthChecker.STATUS_OK
|
HealthChecker.summary = HealthChecker.STATUS_OK
|
||||||
stats = {}
|
stats = {}
|
||||||
self.config.load_config()
|
self.config.load_config()
|
||||||
# check state first to avoid user change boot timeout in configuration file
|
|
||||||
# after finishing system boot
|
|
||||||
if self._state == self.STATE_BOOTING and self._is_system_booting():
|
|
||||||
self._set_system_led(chassis, self.config, 'booting')
|
|
||||||
return self._state, stats
|
|
||||||
|
|
||||||
for checker in self._checkers:
|
for checker in self._checkers:
|
||||||
self._do_check(checker, stats)
|
self._do_check(checker, stats)
|
||||||
@ -56,7 +45,7 @@ class HealthCheckerManager(object):
|
|||||||
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
|
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
|
||||||
self._set_system_led(chassis, self.config, led_status)
|
self._set_system_led(chassis, self.config, led_status)
|
||||||
|
|
||||||
return self._state, stats
|
return stats
|
||||||
|
|
||||||
def _do_check(self, checker, stats):
|
def _do_check(self, checker, stats):
|
||||||
"""
|
"""
|
||||||
@ -86,15 +75,6 @@ class HealthCheckerManager(object):
|
|||||||
else:
|
else:
|
||||||
stats['Internal'].update(entry)
|
stats['Internal'].update(entry)
|
||||||
|
|
||||||
def _is_system_booting(self):
|
|
||||||
uptime = utils.get_uptime()
|
|
||||||
if not self.boot_timeout:
|
|
||||||
self.boot_timeout = self.config.get_bootup_timeout()
|
|
||||||
booting = uptime < self.boot_timeout
|
|
||||||
if not booting:
|
|
||||||
self._state = self.STATE_RUNNING
|
|
||||||
return booting
|
|
||||||
|
|
||||||
def _set_system_led(self, chassis, config, status):
|
def _set_system_led(self, chassis, config, status):
|
||||||
try:
|
try:
|
||||||
chassis.set_status_led(config.get_led_color(status))
|
chassis.set_status_led(config.get_led_color(status))
|
||||||
|
@ -218,7 +218,7 @@ class ServiceChecker(HealthChecker):
|
|||||||
output = utils.run_command(ServiceChecker.CHECK_CMD)
|
output = utils.run_command(ServiceChecker.CHECK_CMD)
|
||||||
lines = output.splitlines()
|
lines = output.splitlines()
|
||||||
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
|
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
|
||||||
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
|
self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
|
||||||
return
|
return
|
||||||
|
|
||||||
status_begin = lines[1].find('Status')
|
status_begin = lines[1].find('Status')
|
||||||
|
@ -8,7 +8,7 @@ def run_command(command):
|
|||||||
:return: Output of the shell command.
|
:return: Output of the shell command.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
|
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
return process.communicate()[0]
|
return process.communicate()[0]
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'
|
|||||||
|
|
||||||
class HealthDaemon(DaemonBase):
|
class HealthDaemon(DaemonBase):
|
||||||
"""
|
"""
|
||||||
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
|
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
|
||||||
according to the check result and store the check result to redis.
|
according to the check result and store the check result to redis.
|
||||||
"""
|
"""
|
||||||
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
|
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
|
||||||
@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
|
|||||||
def deinit(self):
|
def deinit(self):
|
||||||
"""
|
"""
|
||||||
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
|
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
self._clear_system_health_table()
|
self._clear_system_health_table()
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
|
|||||||
def run(self):
|
def run(self):
|
||||||
"""
|
"""
|
||||||
Check system health in an infinite loop.
|
Check system health in an infinite loop.
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
self.log_notice("Starting up...")
|
self.log_notice("Starting up...")
|
||||||
|
|
||||||
@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
|
|||||||
self.log_warning("System health configuration file not found, exit...")
|
self.log_warning("System health configuration file not found, exit...")
|
||||||
return
|
return
|
||||||
while 1:
|
while 1:
|
||||||
state, stat = manager.check(chassis)
|
stat = manager.check(chassis)
|
||||||
if state == HealthCheckerManager.STATE_RUNNING:
|
self._process_stat(chassis, manager.config, stat)
|
||||||
self._process_stat(chassis, manager.config, stat)
|
|
||||||
|
|
||||||
if self.stop_event.wait(manager.config.interval):
|
if self.stop_event.wait(manager.config.interval):
|
||||||
break
|
break
|
||||||
|
@ -439,26 +439,14 @@ def test_config():
|
|||||||
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
|
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
|
||||||
@patch('health_checker.service_checker.ServiceChecker.get_info')
|
@patch('health_checker.service_checker.ServiceChecker.get_info')
|
||||||
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
|
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
|
||||||
@patch('health_checker.utils.get_uptime')
|
def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
|
||||||
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
|
|
||||||
chassis = MagicMock()
|
chassis = MagicMock()
|
||||||
chassis.set_status_led = MagicMock()
|
chassis.set_status_led = MagicMock()
|
||||||
|
|
||||||
manager = HealthCheckerManager()
|
manager = HealthCheckerManager()
|
||||||
manager.config.user_defined_checkers = ['some check']
|
manager.config.user_defined_checkers = ['some check']
|
||||||
assert manager._state == HealthCheckerManager.STATE_BOOTING
|
|
||||||
assert len(manager._checkers) == 2
|
assert len(manager._checkers) == 2
|
||||||
|
|
||||||
mock_uptime.return_value = 200
|
|
||||||
assert manager._is_system_booting()
|
|
||||||
state, stat = manager.check(chassis)
|
|
||||||
assert state == HealthCheckerManager.STATE_BOOTING
|
|
||||||
assert len(stat) == 0
|
|
||||||
chassis.set_status_led.assert_called_with('orange_blink')
|
|
||||||
|
|
||||||
mock_uptime.return_value = 500
|
|
||||||
assert not manager._is_system_booting()
|
|
||||||
assert manager._state == HealthCheckerManager.STATE_RUNNING
|
|
||||||
mock_hw_info.return_value = {
|
mock_hw_info.return_value = {
|
||||||
'ASIC': {
|
'ASIC': {
|
||||||
'type': 'ASIC',
|
'type': 'ASIC',
|
||||||
@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
|
|||||||
'status': 'OK'
|
'status': 'OK'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
state, stat = manager.check(chassis)
|
stat = manager.check(chassis)
|
||||||
assert state == HealthCheckerManager.STATE_RUNNING
|
|
||||||
assert 'Services' in stat
|
assert 'Services' in stat
|
||||||
assert stat['Services']['snmp:snmpd']['status'] == 'OK'
|
assert stat['Services']['snmp:snmpd']['status'] == 'OK'
|
||||||
|
|
||||||
@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
|
|||||||
mock_hw_info.side_effect = RuntimeError()
|
mock_hw_info.side_effect = RuntimeError()
|
||||||
mock_service_info.side_effect = RuntimeError()
|
mock_service_info.side_effect = RuntimeError()
|
||||||
mock_udc_info.side_effect = RuntimeError()
|
mock_udc_info.side_effect = RuntimeError()
|
||||||
state, stat = manager.check(chassis)
|
stat = manager.check(chassis)
|
||||||
assert 'Internal' in stat
|
assert 'Internal' in stat
|
||||||
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
|
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
|
||||||
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
|
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
|
||||||
@ -518,6 +505,3 @@ def test_utils():
|
|||||||
|
|
||||||
output = utils.run_command('ls')
|
output = utils.run_command('ls')
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
uptime = utils.get_uptime()
|
|
||||||
assert uptime > 0
|
|
||||||
|
Reference in New Issue
Block a user