[system-health] Make check interval more accurate (#14085)
- Why I did it Healthd checks the system status every 60 seconds. However, running the checkers itself may take several seconds. If the checkers take X seconds, healthd needs (60 + X) seconds to finish one iteration. This makes the sonic-mgmt test cases unstable, because X is hard to predict and differs between platforms. This PR introduces an interval compensation mechanism in the healthd main loop. - How I did it Introduce an interval compensation mechanism in the healthd main loop: after an iteration that takes X seconds, healthd waits (60 - X) seconds before starting the next one. If an iteration takes longer than the configured interval, healthd logs a notice and waits 1 second instead. - How to verify it Manual test Unit test
This commit is contained in:
parent
7bba702f1e
commit
5df167b346
@ -1,2 +1,2 @@
|
|||||||
[pytest]
|
[pytest]
|
||||||
addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml
|
addopts = --cov=health_checker --cov=healthd --cov-report html --cov-report term --cov-report xml
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
|
|
||||||
import signal
|
import signal
|
||||||
import threading
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
from sonic_py_common.daemon_base import DaemonBase
|
from sonic_py_common.daemon_base import DaemonBase
|
||||||
from swsscommon.swsscommon import SonicV2Connector
|
from swsscommon.swsscommon import SonicV2Connector
|
||||||
@ -79,18 +80,27 @@ class HealthDaemon(DaemonBase):
|
|||||||
return
|
return
|
||||||
sysmon = Sysmonitor()
|
sysmon = Sysmonitor()
|
||||||
sysmon.task_run()
|
sysmon.task_run()
|
||||||
while 1:
|
while self._run_checker(manager, chassis):
|
||||||
stat = manager.check(chassis)
|
pass
|
||||||
self._process_stat(chassis, manager.config, stat)
|
|
||||||
|
|
||||||
if self.stop_event.wait(manager.config.interval):
|
|
||||||
break
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
|
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
|
||||||
|
|
||||||
self.deinit()
|
self.deinit()
|
||||||
sysmon.task_stop()
|
sysmon.task_stop()
|
||||||
|
|
||||||
|
def _run_checker(self, manager, chassis):
|
||||||
|
begin = time.time()
|
||||||
|
stat = manager.check(chassis)
|
||||||
|
self._process_stat(chassis, manager.config, stat)
|
||||||
|
elapse = time.time() - begin
|
||||||
|
sleep_time_in_sec = manager.config.interval - elapse
|
||||||
|
if sleep_time_in_sec < 0:
|
||||||
|
self.log_notice(f'System health takes {elapse} seconds for one iteration')
|
||||||
|
sleep_time_in_sec = 1
|
||||||
|
if self.stop_event.wait(sleep_time_in_sec):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
def _process_stat(self, chassis, config, stat):
|
def _process_stat(self, chassis, config, stat):
|
||||||
from health_checker.health_checker import HealthChecker
|
from health_checker.health_checker import HealthChecker
|
||||||
self._clear_system_health_table()
|
self._clear_system_health_table()
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
import copy
|
import copy
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from imp import load_source
|
||||||
from swsscommon import swsscommon
|
from swsscommon import swsscommon
|
||||||
|
|
||||||
from mock import Mock, MagicMock, patch
|
from mock import Mock, MagicMock, patch
|
||||||
@ -23,7 +24,9 @@ swsscommon.SonicV2Connector = MockConnector
|
|||||||
|
|
||||||
test_path = os.path.dirname(os.path.abspath(__file__))
|
test_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
modules_path = os.path.dirname(test_path)
|
modules_path = os.path.dirname(test_path)
|
||||||
|
scripts_path = os.path.join(modules_path, 'scripts')
|
||||||
sys.path.insert(0, modules_path)
|
sys.path.insert(0, modules_path)
|
||||||
|
sys.path.insert(0, scripts_path)
|
||||||
from health_checker import utils
|
from health_checker import utils
|
||||||
from health_checker.config import Config
|
from health_checker.config import Config
|
||||||
from health_checker.hardware_checker import HardwareChecker
|
from health_checker.hardware_checker import HardwareChecker
|
||||||
@ -35,6 +38,9 @@ from health_checker.sysmonitor import Sysmonitor
|
|||||||
from health_checker.sysmonitor import MonitorStateDbTask
|
from health_checker.sysmonitor import MonitorStateDbTask
|
||||||
from health_checker.sysmonitor import MonitorSystemBusTask
|
from health_checker.sysmonitor import MonitorSystemBusTask
|
||||||
|
|
||||||
|
load_source('healthd', os.path.join(scripts_path, 'healthd'))
|
||||||
|
from healthd import HealthDaemon
|
||||||
|
|
||||||
mock_supervisorctl_output = """
|
mock_supervisorctl_output = """
|
||||||
snmpd RUNNING pid 67, uptime 1:03:56
|
snmpd RUNNING pid 67, uptime 1:03:56
|
||||||
snmp-subagent EXITED Oct 19 01:53 AM
|
snmp-subagent EXITED Oct 19 01:53 AM
|
||||||
@ -740,3 +746,26 @@ def test_get_service_from_feature_table():
|
|||||||
sysmon.get_service_from_feature_table(dir_list)
|
sysmon.get_service_from_feature_table(dir_list)
|
||||||
assert 'bgp.service' in dir_list
|
assert 'bgp.service' in dir_list
|
||||||
assert 'swss.service' not in dir_list
|
assert 'swss.service' not in dir_list
|
||||||
|
|
||||||
|
|
||||||
|
@patch('healthd.time.time')
|
||||||
|
def test_healthd_check_interval(mock_time):
|
||||||
|
daemon = HealthDaemon()
|
||||||
|
manager = MagicMock()
|
||||||
|
manager.check = MagicMock()
|
||||||
|
manager.config = MagicMock()
|
||||||
|
chassis = MagicMock()
|
||||||
|
daemon._process_stat = MagicMock()
|
||||||
|
daemon.stop_event = MagicMock()
|
||||||
|
daemon.stop_event.wait = MagicMock()
|
||||||
|
|
||||||
|
daemon.stop_event.wait.return_value = False
|
||||||
|
manager.config.interval = 60
|
||||||
|
mock_time.side_effect = [0, 3, 0, 61, 0, 1]
|
||||||
|
assert daemon._run_checker(manager, chassis)
|
||||||
|
daemon.stop_event.wait.assert_called_with(57)
|
||||||
|
assert daemon._run_checker(manager, chassis)
|
||||||
|
daemon.stop_event.wait.assert_called_with(1)
|
||||||
|
|
||||||
|
daemon.stop_event.wait.return_value = True
|
||||||
|
assert not daemon._run_checker(manager, chassis)
|
||||||
|
Loading…
Reference in New Issue
Block a user