[system-health] Make check interval more accurate (#14085)
- Why I did it Healthd check system status every 60 seconds. However, running checker may take several seconds. Say checker takes X seconds, healthd takes (60 + X) seconds to finish one iteration. This implementation makes sonic-mgmt test case not so stable because the value X is hard to predict and different among different platforms. This PR introduces an interval compensation mechanism to healthd main loop. - How I did it Introduces an interval compensation mechanism to healthd main loop: healthd should wait (60 - X) seconds for next iteration - How to verify it Manual test Unit test
This commit is contained in:
parent
7bba702f1e
commit
5df167b346
@ -1,2 +1,2 @@
|
||||
[pytest]
|
||||
addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml
|
||||
addopts = --cov=health_checker --cov=healthd --cov-report html --cov-report term --cov-report xml
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
import signal
|
||||
import threading
|
||||
import time
|
||||
|
||||
from sonic_py_common.daemon_base import DaemonBase
|
||||
from swsscommon.swsscommon import SonicV2Connector
|
||||
@ -79,18 +80,27 @@ class HealthDaemon(DaemonBase):
|
||||
return
|
||||
sysmon = Sysmonitor()
|
||||
sysmon.task_run()
|
||||
while 1:
|
||||
stat = manager.check(chassis)
|
||||
self._process_stat(chassis, manager.config, stat)
|
||||
|
||||
if self.stop_event.wait(manager.config.interval):
|
||||
break
|
||||
while self._run_checker(manager, chassis):
|
||||
pass
|
||||
except ImportError:
|
||||
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
|
||||
|
||||
self.deinit()
|
||||
sysmon.task_stop()
|
||||
|
||||
def _run_checker(self, manager, chassis):
|
||||
begin = time.time()
|
||||
stat = manager.check(chassis)
|
||||
self._process_stat(chassis, manager.config, stat)
|
||||
elapse = time.time() - begin
|
||||
sleep_time_in_sec = manager.config.interval - elapse
|
||||
if sleep_time_in_sec < 0:
|
||||
self.log_notice(f'System health takes {elapse} seconds for one iteration')
|
||||
sleep_time_in_sec = 1
|
||||
if self.stop_event.wait(sleep_time_in_sec):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _process_stat(self, chassis, config, stat):
|
||||
from health_checker.health_checker import HealthChecker
|
||||
self._clear_system_health_table()
|
||||
|
@ -12,6 +12,7 @@
|
||||
import copy
|
||||
import os
|
||||
import sys
|
||||
from imp import load_source
|
||||
from swsscommon import swsscommon
|
||||
|
||||
from mock import Mock, MagicMock, patch
|
||||
@ -23,7 +24,9 @@ swsscommon.SonicV2Connector = MockConnector
|
||||
|
||||
test_path = os.path.dirname(os.path.abspath(__file__))
|
||||
modules_path = os.path.dirname(test_path)
|
||||
scripts_path = os.path.join(modules_path, 'scripts')
|
||||
sys.path.insert(0, modules_path)
|
||||
sys.path.insert(0, scripts_path)
|
||||
from health_checker import utils
|
||||
from health_checker.config import Config
|
||||
from health_checker.hardware_checker import HardwareChecker
|
||||
@ -35,6 +38,9 @@ from health_checker.sysmonitor import Sysmonitor
|
||||
from health_checker.sysmonitor import MonitorStateDbTask
|
||||
from health_checker.sysmonitor import MonitorSystemBusTask
|
||||
|
||||
load_source('healthd', os.path.join(scripts_path, 'healthd'))
|
||||
from healthd import HealthDaemon
|
||||
|
||||
mock_supervisorctl_output = """
|
||||
snmpd RUNNING pid 67, uptime 1:03:56
|
||||
snmp-subagent EXITED Oct 19 01:53 AM
|
||||
@ -740,3 +746,26 @@ def test_get_service_from_feature_table():
|
||||
sysmon.get_service_from_feature_table(dir_list)
|
||||
assert 'bgp.service' in dir_list
|
||||
assert 'swss.service' not in dir_list
|
||||
|
||||
|
||||
@patch('healthd.time.time')
|
||||
def test_healthd_check_interval(mock_time):
|
||||
daemon = HealthDaemon()
|
||||
manager = MagicMock()
|
||||
manager.check = MagicMock()
|
||||
manager.config = MagicMock()
|
||||
chassis = MagicMock()
|
||||
daemon._process_stat = MagicMock()
|
||||
daemon.stop_event = MagicMock()
|
||||
daemon.stop_event.wait = MagicMock()
|
||||
|
||||
daemon.stop_event.wait.return_value = False
|
||||
manager.config.interval = 60
|
||||
mock_time.side_effect = [0, 3, 0, 61, 0, 1]
|
||||
assert daemon._run_checker(manager, chassis)
|
||||
daemon.stop_event.wait.assert_called_with(57)
|
||||
assert daemon._run_checker(manager, chassis)
|
||||
daemon.stop_event.wait.assert_called_with(1)
|
||||
|
||||
daemon.stop_event.wait.return_value = True
|
||||
assert not daemon._run_checker(manager, chassis)
|
||||
|
Loading…
Reference in New Issue
Block a user