[system-health] Make check interval more accurate (#14085)

- Why I did it

Healthd check system status every 60 seconds. However, running checker may take several seconds. Say checker takes X seconds, healthd takes (60 + X) seconds to finish one iteration. This implementation makes sonic-mgmt test case not so stable because the value X is hard to predict and different among different platforms. This PR introduces an interval
compensation mechanism to healthd main loop.

- How I did it

Introduces an interval compensation mechanism to healthd main loop: healthd should wait (60 - X) seconds for next iteration

- How to verify it

Manual test
Unit test
This commit is contained in:
Junchao-Mellanox 2023-03-15 13:21:00 +08:00 committed by mssonicbld
parent 7bba702f1e
commit 5df167b346
3 changed files with 46 additions and 7 deletions

View File

@ -1,2 +1,2 @@
[pytest]
addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml
addopts = --cov=health_checker --cov=healthd --cov-report html --cov-report term --cov-report xml

View File

@ -7,6 +7,7 @@
import signal
import threading
import time
from sonic_py_common.daemon_base import DaemonBase
from swsscommon.swsscommon import SonicV2Connector
@ -79,18 +80,27 @@ class HealthDaemon(DaemonBase):
return
sysmon = Sysmonitor()
sysmon.task_run()
while 1:
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)
if self.stop_event.wait(manager.config.interval):
break
while self._run_checker(manager, chassis):
pass
except ImportError:
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
self.deinit()
sysmon.task_stop()
def _run_checker(self, manager, chassis):
begin = time.time()
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)
elapse = time.time() - begin
sleep_time_in_sec = manager.config.interval - elapse
if sleep_time_in_sec < 0:
self.log_notice(f'System health takes {elapse} seconds for one iteration')
sleep_time_in_sec = 1
if self.stop_event.wait(sleep_time_in_sec):
return False
return True
def _process_stat(self, chassis, config, stat):
from health_checker.health_checker import HealthChecker
self._clear_system_health_table()

View File

@ -12,6 +12,7 @@
import copy
import os
import sys
from imp import load_source
from swsscommon import swsscommon
from mock import Mock, MagicMock, patch
@ -23,7 +24,9 @@ swsscommon.SonicV2Connector = MockConnector
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
scripts_path = os.path.join(modules_path, 'scripts')
sys.path.insert(0, modules_path)
sys.path.insert(0, scripts_path)
from health_checker import utils
from health_checker.config import Config
from health_checker.hardware_checker import HardwareChecker
@ -35,6 +38,9 @@ from health_checker.sysmonitor import Sysmonitor
from health_checker.sysmonitor import MonitorStateDbTask
from health_checker.sysmonitor import MonitorSystemBusTask
load_source('healthd', os.path.join(scripts_path, 'healthd'))
from healthd import HealthDaemon
mock_supervisorctl_output = """
snmpd RUNNING pid 67, uptime 1:03:56
snmp-subagent EXITED Oct 19 01:53 AM
@ -740,3 +746,26 @@ def test_get_service_from_feature_table():
sysmon.get_service_from_feature_table(dir_list)
assert 'bgp.service' in dir_list
assert 'swss.service' not in dir_list
@patch('healthd.time.time')
def test_healthd_check_interval(mock_time):
daemon = HealthDaemon()
manager = MagicMock()
manager.check = MagicMock()
manager.config = MagicMock()
chassis = MagicMock()
daemon._process_stat = MagicMock()
daemon.stop_event = MagicMock()
daemon.stop_event.wait = MagicMock()
daemon.stop_event.wait.return_value = False
manager.config.interval = 60
mock_time.side_effect = [0, 3, 0, 61, 0, 1]
assert daemon._run_checker(manager, chassis)
daemon.stop_event.wait.assert_called_with(57)
assert daemon._run_checker(manager, chassis)
daemon.stop_event.wait.assert_called_with(1)
daemon.stop_event.wait.return_value = True
assert not daemon._run_checker(manager, chassis)