Make system health service start early (#9792)

- Why I did it
This change supports the SYSTEM READY feature. Currently, the system health service has a booting stage that indicates the system is still loading SONiC components. This booting stage is no longer needed, because the SYSTEM READY feature will treat that stage as system "NOT READY".

- How I did it
1. Remove booting stage
2. Adjust unit test cases

- How to verify it
Manual test, Unit test, sonic-mgmt Regression
This commit is contained in:
Junchao-Mellanox 2022-01-27 19:46:52 +08:00 committed by GitHub
parent 43e967d6a4
commit c06cb219e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 12 additions and 49 deletions

View File

@ -1,4 +1,3 @@
from . import utils
from .config import Config from .config import Config
from .health_checker import HealthChecker from .health_checker import HealthChecker
from .service_checker import ServiceChecker from .service_checker import ServiceChecker
@ -10,14 +9,10 @@ class HealthCheckerManager(object):
""" """
Manage all system health checkers and system health configuration. Manage all system health checkers and system health configuration.
""" """
STATE_BOOTING = 'booting'
STATE_RUNNING = 'running'
boot_timeout = None boot_timeout = None
def __init__(self): def __init__(self):
self._checkers = [] self._checkers = []
self._state = self.STATE_BOOTING
self.config = Config() self.config = Config()
self.initialize() self.initialize()
@ -33,17 +28,11 @@ class HealthCheckerManager(object):
""" """
Load new configuration if any and perform the system health check for all existing checkers. Load new configuration if any and perform the system health check for all existing checkers.
:param chassis: A chassis object. :param chassis: A chassis object.
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that :return: A dictionary that contains the status for all objects that was checked.
contains the status for all objects that was checked.
""" """
HealthChecker.summary = HealthChecker.STATUS_OK HealthChecker.summary = HealthChecker.STATUS_OK
stats = {} stats = {}
self.config.load_config() self.config.load_config()
# check state first to avoid user change boot timeout in configuration file
# after finishing system boot
if self._state == self.STATE_BOOTING and self._is_system_booting():
self._set_system_led(chassis, self.config, 'booting')
return self._state, stats
for checker in self._checkers: for checker in self._checkers:
self._do_check(checker, stats) self._do_check(checker, stats)
@ -56,7 +45,7 @@ class HealthCheckerManager(object):
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault' led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
self._set_system_led(chassis, self.config, led_status) self._set_system_led(chassis, self.config, led_status)
return self._state, stats return stats
def _do_check(self, checker, stats): def _do_check(self, checker, stats):
""" """
@ -86,15 +75,6 @@ class HealthCheckerManager(object):
else: else:
stats['Internal'].update(entry) stats['Internal'].update(entry)
def _is_system_booting(self):
uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout
if not booting:
self._state = self.STATE_RUNNING
return booting
def _set_system_led(self, chassis, config, status): def _set_system_led(self, chassis, config, status):
try: try:
chassis.set_status_led(config.get_led_color(status)) chassis.set_status_led(config.get_led_color(status))

View File

@ -218,7 +218,7 @@ class ServiceChecker(HealthChecker):
output = utils.run_command(ServiceChecker.CHECK_CMD) output = utils.run_command(ServiceChecker.CHECK_CMD)
lines = output.splitlines() lines = output.splitlines()
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES: if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible') self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
return return
status_begin = lines[1].find('Status') status_begin = lines[1].find('Status')

View File

@ -8,7 +8,7 @@ def run_command(command):
:return: Output of the shell command. :return: Output of the shell command.
""" """
try: try:
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE) process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return process.communicate()[0] return process.communicate()[0]
except Exception: except Exception:
return None return None

View File

@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'
class HealthDaemon(DaemonBase): class HealthDaemon(DaemonBase):
""" """
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
according to the check result and store the check result to redis. according to the check result and store the check result to redis.
""" """
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO' SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
def deinit(self): def deinit(self):
""" """
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table. Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
:return: :return:
""" """
self._clear_system_health_table() self._clear_system_health_table()
@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
def run(self): def run(self):
""" """
Check system health in an infinite loop. Check system health in an infinite loop.
:return: :return:
""" """
self.log_notice("Starting up...") self.log_notice("Starting up...")
@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
self.log_warning("System health configuration file not found, exit...") self.log_warning("System health configuration file not found, exit...")
return return
while 1: while 1:
state, stat = manager.check(chassis) stat = manager.check(chassis)
if state == HealthCheckerManager.STATE_RUNNING: self._process_stat(chassis, manager.config, stat)
self._process_stat(chassis, manager.config, stat)
if self.stop_event.wait(manager.config.interval): if self.stop_event.wait(manager.config.interval):
break break

View File

@ -439,26 +439,14 @@ def test_config():
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info') @patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
@patch('health_checker.service_checker.ServiceChecker.get_info') @patch('health_checker.service_checker.ServiceChecker.get_info')
@patch('health_checker.hardware_checker.HardwareChecker.get_info') @patch('health_checker.hardware_checker.HardwareChecker.get_info')
@patch('health_checker.utils.get_uptime') def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
chassis = MagicMock() chassis = MagicMock()
chassis.set_status_led = MagicMock() chassis.set_status_led = MagicMock()
manager = HealthCheckerManager() manager = HealthCheckerManager()
manager.config.user_defined_checkers = ['some check'] manager.config.user_defined_checkers = ['some check']
assert manager._state == HealthCheckerManager.STATE_BOOTING
assert len(manager._checkers) == 2 assert len(manager._checkers) == 2
mock_uptime.return_value = 200
assert manager._is_system_booting()
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_BOOTING
assert len(stat) == 0
chassis.set_status_led.assert_called_with('orange_blink')
mock_uptime.return_value = 500
assert not manager._is_system_booting()
assert manager._state == HealthCheckerManager.STATE_RUNNING
mock_hw_info.return_value = { mock_hw_info.return_value = {
'ASIC': { 'ASIC': {
'type': 'ASIC', 'type': 'ASIC',
@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
'status': 'OK' 'status': 'OK'
} }
} }
state, stat = manager.check(chassis) stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_RUNNING
assert 'Services' in stat assert 'Services' in stat
assert stat['Services']['snmp:snmpd']['status'] == 'OK' assert stat['Services']['snmp:snmpd']['status'] == 'OK'
@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
mock_hw_info.side_effect = RuntimeError() mock_hw_info.side_effect = RuntimeError()
mock_service_info.side_effect = RuntimeError() mock_service_info.side_effect = RuntimeError()
mock_udc_info.side_effect = RuntimeError() mock_udc_info.side_effect = RuntimeError()
state, stat = manager.check(chassis) stat = manager.check(chassis)
assert 'Internal' in stat assert 'Internal' in stat
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK' assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK' assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
@ -518,6 +505,3 @@ def test_utils():
output = utils.run_command('ls') output = utils.run_command('ls')
assert output assert output
uptime = utils.get_uptime()
assert uptime > 0