Make system health service start early (#9792)
- Why I did it: For the SYSTEM READY feature. Currently, the system health service has a booting stage that indicates the system is still loading SONiC components. This booting stage is no longer needed because the SYSTEM READY feature treats that stage as system "NOT READY". - How I did it: 1. Removed the booting stage. 2. Adjusted the unit test cases. - How to verify it: manual test, unit test, sonic-mgmt regression.
This commit is contained in:
parent
43e967d6a4
commit
c06cb219e2
@ -1,4 +1,3 @@
|
||||
from . import utils
|
||||
from .config import Config
|
||||
from .health_checker import HealthChecker
|
||||
from .service_checker import ServiceChecker
|
||||
@ -10,14 +9,10 @@ class HealthCheckerManager(object):
|
||||
"""
|
||||
Manage all system health checkers and system health configuration.
|
||||
"""
|
||||
STATE_BOOTING = 'booting'
|
||||
STATE_RUNNING = 'running'
|
||||
boot_timeout = None
|
||||
|
||||
def __init__(self):
|
||||
self._checkers = []
|
||||
self._state = self.STATE_BOOTING
|
||||
|
||||
self.config = Config()
|
||||
self.initialize()
|
||||
|
||||
@ -33,17 +28,11 @@ class HealthCheckerManager(object):
|
||||
"""
|
||||
Load new configuration if any and perform the system health check for all existing checkers.
|
||||
:param chassis: A chassis object.
|
||||
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
|
||||
contains the status for all objects that was checked.
|
||||
:return: A dictionary that contains the status for all objects that was checked.
|
||||
"""
|
||||
HealthChecker.summary = HealthChecker.STATUS_OK
|
||||
stats = {}
|
||||
self.config.load_config()
|
||||
# check state first to avoid user change boot timeout in configuration file
|
||||
# after finishing system boot
|
||||
if self._state == self.STATE_BOOTING and self._is_system_booting():
|
||||
self._set_system_led(chassis, self.config, 'booting')
|
||||
return self._state, stats
|
||||
|
||||
for checker in self._checkers:
|
||||
self._do_check(checker, stats)
|
||||
@ -56,7 +45,7 @@ class HealthCheckerManager(object):
|
||||
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
|
||||
self._set_system_led(chassis, self.config, led_status)
|
||||
|
||||
return self._state, stats
|
||||
return stats
|
||||
|
||||
def _do_check(self, checker, stats):
|
||||
"""
|
||||
@ -86,15 +75,6 @@ class HealthCheckerManager(object):
|
||||
else:
|
||||
stats['Internal'].update(entry)
|
||||
|
||||
def _is_system_booting(self):
|
||||
uptime = utils.get_uptime()
|
||||
if not self.boot_timeout:
|
||||
self.boot_timeout = self.config.get_bootup_timeout()
|
||||
booting = uptime < self.boot_timeout
|
||||
if not booting:
|
||||
self._state = self.STATE_RUNNING
|
||||
return booting
|
||||
|
||||
def _set_system_led(self, chassis, config, status):
|
||||
try:
|
||||
chassis.set_status_led(config.get_led_color(status))
|
||||
|
@ -218,7 +218,7 @@ class ServiceChecker(HealthChecker):
|
||||
output = utils.run_command(ServiceChecker.CHECK_CMD)
|
||||
lines = output.splitlines()
|
||||
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
|
||||
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
|
||||
self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
|
||||
return
|
||||
|
||||
status_begin = lines[1].find('Status')
|
||||
|
@ -8,7 +8,7 @@ def run_command(command):
|
||||
:return: Output of the shell command.
|
||||
"""
|
||||
try:
|
||||
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
|
||||
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
return process.communicate()[0]
|
||||
except Exception:
|
||||
return None
|
||||
|
@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'
|
||||
|
||||
class HealthDaemon(DaemonBase):
|
||||
"""
|
||||
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
|
||||
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
|
||||
according to the check result and store the check result to redis.
|
||||
"""
|
||||
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
|
||||
@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
|
||||
def deinit(self):
|
||||
"""
|
||||
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
|
||||
:return:
|
||||
:return:
|
||||
"""
|
||||
self._clear_system_health_table()
|
||||
|
||||
@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
|
||||
def run(self):
|
||||
"""
|
||||
Check system health in an infinite loop.
|
||||
:return:
|
||||
:return:
|
||||
"""
|
||||
self.log_notice("Starting up...")
|
||||
|
||||
@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
|
||||
self.log_warning("System health configuration file not found, exit...")
|
||||
return
|
||||
while 1:
|
||||
state, stat = manager.check(chassis)
|
||||
if state == HealthCheckerManager.STATE_RUNNING:
|
||||
self._process_stat(chassis, manager.config, stat)
|
||||
stat = manager.check(chassis)
|
||||
self._process_stat(chassis, manager.config, stat)
|
||||
|
||||
if self.stop_event.wait(manager.config.interval):
|
||||
break
|
||||
|
@ -439,26 +439,14 @@ def test_config():
|
||||
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
|
||||
@patch('health_checker.service_checker.ServiceChecker.get_info')
|
||||
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
|
||||
@patch('health_checker.utils.get_uptime')
|
||||
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
|
||||
def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
|
||||
chassis = MagicMock()
|
||||
chassis.set_status_led = MagicMock()
|
||||
|
||||
manager = HealthCheckerManager()
|
||||
manager.config.user_defined_checkers = ['some check']
|
||||
assert manager._state == HealthCheckerManager.STATE_BOOTING
|
||||
assert len(manager._checkers) == 2
|
||||
|
||||
mock_uptime.return_value = 200
|
||||
assert manager._is_system_booting()
|
||||
state, stat = manager.check(chassis)
|
||||
assert state == HealthCheckerManager.STATE_BOOTING
|
||||
assert len(stat) == 0
|
||||
chassis.set_status_led.assert_called_with('orange_blink')
|
||||
|
||||
mock_uptime.return_value = 500
|
||||
assert not manager._is_system_booting()
|
||||
assert manager._state == HealthCheckerManager.STATE_RUNNING
|
||||
mock_hw_info.return_value = {
|
||||
'ASIC': {
|
||||
'type': 'ASIC',
|
||||
@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
|
||||
'status': 'OK'
|
||||
}
|
||||
}
|
||||
state, stat = manager.check(chassis)
|
||||
assert state == HealthCheckerManager.STATE_RUNNING
|
||||
stat = manager.check(chassis)
|
||||
assert 'Services' in stat
|
||||
assert stat['Services']['snmp:snmpd']['status'] == 'OK'
|
||||
|
||||
@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
|
||||
mock_hw_info.side_effect = RuntimeError()
|
||||
mock_service_info.side_effect = RuntimeError()
|
||||
mock_udc_info.side_effect = RuntimeError()
|
||||
state, stat = manager.check(chassis)
|
||||
stat = manager.check(chassis)
|
||||
assert 'Internal' in stat
|
||||
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
|
||||
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
|
||||
@ -518,6 +505,3 @@ def test_utils():
|
||||
|
||||
output = utils.run_command('ls')
|
||||
assert output
|
||||
|
||||
uptime = utils.get_uptime()
|
||||
assert uptime > 0
|
||||
|
Loading…
Reference in New Issue
Block a user