Make system health service start early (#9792)

- Why I did it
For SYSTEM READY feature. Currently, there is a booting stage in system health service to indicate that the system is loading SONiC component. This booting stage is no longer needed because SYSTEM READY feature will treat that stage as system "NOT READY".

- How I did it
1. Remove booting stage
2. Adjust unit test cases

- How to verify it
Manual test, Unit test, sonic-mgmt Regression
This commit is contained in:
Junchao-Mellanox 2022-01-27 19:46:52 +08:00 committed by GitHub
parent 43e967d6a4
commit c06cb219e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 12 additions and 49 deletions

View File

@ -1,4 +1,3 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
@ -10,14 +9,10 @@ class HealthCheckerManager(object):
"""
Manage all system health checkers and system health configuration.
"""
STATE_BOOTING = 'booting'
STATE_RUNNING = 'running'
boot_timeout = None
def __init__(self):
self._checkers = []
self._state = self.STATE_BOOTING
self.config = Config()
self.initialize()
@ -33,17 +28,11 @@ class HealthCheckerManager(object):
"""
Load new configuration if any and perform the system health check for all existing checkers.
:param chassis: A chassis object.
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
contains the status for all objects that was checked.
:return: A dictionary that contains the status for all objects that was checked.
"""
HealthChecker.summary = HealthChecker.STATUS_OK
stats = {}
self.config.load_config()
# check state first to avoid user change boot timeout in configuration file
# after finishing system boot
if self._state == self.STATE_BOOTING and self._is_system_booting():
self._set_system_led(chassis, self.config, 'booting')
return self._state, stats
for checker in self._checkers:
self._do_check(checker, stats)
@ -56,7 +45,7 @@ class HealthCheckerManager(object):
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
self._set_system_led(chassis, self.config, led_status)
return self._state, stats
return stats
def _do_check(self, checker, stats):
"""
@ -86,15 +75,6 @@ class HealthCheckerManager(object):
else:
stats['Internal'].update(entry)
def _is_system_booting(self):
uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout
if not booting:
self._state = self.STATE_RUNNING
return booting
def _set_system_led(self, chassis, config, status):
try:
chassis.set_status_led(config.get_led_color(status))

View File

@ -218,7 +218,7 @@ class ServiceChecker(HealthChecker):
output = utils.run_command(ServiceChecker.CHECK_CMD)
lines = output.splitlines()
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
return
status_begin = lines[1].find('Status')

View File

@ -8,7 +8,7 @@ def run_command(command):
:return: Output of the shell command.
"""
try:
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return process.communicate()[0]
except Exception:
return None

View File

@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'
class HealthDaemon(DaemonBase):
"""
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
according to the check result and store the check result to redis.
"""
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
def deinit(self):
"""
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
:return:
:return:
"""
self._clear_system_health_table()
@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
def run(self):
"""
Check system health in an infinite loop.
:return:
:return:
"""
self.log_notice("Starting up...")
@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
self.log_warning("System health configuration file not found, exit...")
return
while 1:
state, stat = manager.check(chassis)
if state == HealthCheckerManager.STATE_RUNNING:
self._process_stat(chassis, manager.config, stat)
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)
if self.stop_event.wait(manager.config.interval):
break

View File

@ -439,26 +439,14 @@ def test_config():
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
@patch('health_checker.service_checker.ServiceChecker.get_info')
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
@patch('health_checker.utils.get_uptime')
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
chassis = MagicMock()
chassis.set_status_led = MagicMock()
manager = HealthCheckerManager()
manager.config.user_defined_checkers = ['some check']
assert manager._state == HealthCheckerManager.STATE_BOOTING
assert len(manager._checkers) == 2
mock_uptime.return_value = 200
assert manager._is_system_booting()
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_BOOTING
assert len(stat) == 0
chassis.set_status_led.assert_called_with('orange_blink')
mock_uptime.return_value = 500
assert not manager._is_system_booting()
assert manager._state == HealthCheckerManager.STATE_RUNNING
mock_hw_info.return_value = {
'ASIC': {
'type': 'ASIC',
@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
'status': 'OK'
}
}
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_RUNNING
stat = manager.check(chassis)
assert 'Services' in stat
assert stat['Services']['snmp:snmpd']['status'] == 'OK'
@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
mock_hw_info.side_effect = RuntimeError()
mock_service_info.side_effect = RuntimeError()
mock_udc_info.side_effect = RuntimeError()
state, stat = manager.check(chassis)
stat = manager.check(chassis)
assert 'Internal' in stat
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
@ -518,6 +505,3 @@ def test_utils():
output = utils.run_command('ls')
assert output
uptime = utils.get_uptime()
assert uptime > 0