From 5df167b346611abadc004aabd8ad6186bcbd1194 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Wed, 15 Mar 2023 13:21:00 +0800 Subject: [PATCH] [system-health] Make check interval more accurate (#14085) - Why I did it Healthd check system status every 60 seconds. However, running checker may take several seconds. Say checker takes X seconds, healthd takes (60 + X) seconds to finish one iteration. This implementation makes sonic-mgmt test case not so stable because the value X is hard to predict and different among different platforms. This PR introduces an interval compensation mechanism to healthd main loop. - How I did it Introduces an interval compensation mechanism to healthd main loop: healthd should wait (60 - X) seconds for next iteration - How to verify it Manual test Unit test --- src/system-health/pytest.ini | 2 +- src/system-health/scripts/healthd | 22 ++++++++++---- src/system-health/tests/test_system_health.py | 29 +++++++++++++++++++ 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/system-health/pytest.ini b/src/system-health/pytest.ini index a9c5a74860..b5b94c91c3 100644 --- a/src/system-health/pytest.ini +++ b/src/system-health/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml +addopts = --cov=health_checker --cov=healthd --cov-report html --cov-report term --cov-report xml diff --git a/src/system-health/scripts/healthd b/src/system-health/scripts/healthd index df52969d3a..cb404de29e 100644 --- a/src/system-health/scripts/healthd +++ b/src/system-health/scripts/healthd @@ -7,6 +7,7 @@ import signal import threading +import time from sonic_py_common.daemon_base import DaemonBase from swsscommon.swsscommon import SonicV2Connector @@ -79,18 +80,27 @@ class HealthDaemon(DaemonBase): return sysmon = Sysmonitor() sysmon.task_run() - while 1: - stat = manager.check(chassis) - self._process_stat(chassis, manager.config, stat) - - if self.stop_event.wait(manager.config.interval): - break + while self._run_checker(manager, chassis): + pass except ImportError: self.log_warning("sonic_platform package not installed. Cannot start system-health daemon") self.deinit() sysmon.task_stop() + def _run_checker(self, manager, chassis): + begin = time.time() + stat = manager.check(chassis) + self._process_stat(chassis, manager.config, stat) + elapse = time.time() - begin + sleep_time_in_sec = manager.config.interval - elapse + if sleep_time_in_sec < 0: + self.log_notice(f'System health takes {elapse} seconds for one iteration') + sleep_time_in_sec = 1 + if self.stop_event.wait(sleep_time_in_sec): + return False + return True + def _process_stat(self, chassis, config, stat): from health_checker.health_checker import HealthChecker self._clear_system_health_table() diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 4dd0462a89..cea80b48a3 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -12,6 +12,7 @@ import copy import os import sys +from imp import load_source from swsscommon import swsscommon from mock import Mock, MagicMock, patch @@ -23,7 +24,9 @@ swsscommon.SonicV2Connector = MockConnector test_path = os.path.dirname(os.path.abspath(__file__)) modules_path = os.path.dirname(test_path) +scripts_path = os.path.join(modules_path, 'scripts') sys.path.insert(0, modules_path) +sys.path.insert(0, scripts_path) from health_checker import utils from health_checker.config import Config from health_checker.hardware_checker import HardwareChecker @@ -35,6 +38,9 @@ from health_checker.sysmonitor import Sysmonitor from health_checker.sysmonitor import MonitorStateDbTask from health_checker.sysmonitor import MonitorSystemBusTask +load_source('healthd', os.path.join(scripts_path, 'healthd')) +from healthd import HealthDaemon + mock_supervisorctl_output = """ snmpd RUNNING pid 67, uptime 1:03:56 snmp-subagent EXITED Oct 19 01:53 AM @@ -740,3 +746,26 @@ def test_get_service_from_feature_table(): sysmon.get_service_from_feature_table(dir_list) assert 'bgp.service' in dir_list assert 'swss.service' not in dir_list + + +@patch('healthd.time.time') +def test_healthd_check_interval(mock_time): + daemon = HealthDaemon() + manager = MagicMock() + manager.check = MagicMock() + manager.config = MagicMock() + chassis = MagicMock() + daemon._process_stat = MagicMock() + daemon.stop_event = MagicMock() + daemon.stop_event.wait = MagicMock() + + daemon.stop_event.wait.return_value = False + manager.config.interval = 60 + mock_time.side_effect = [0, 3, 0, 61, 0, 1] + assert daemon._run_checker(manager, chassis) + daemon.stop_event.wait.assert_called_with(57) + assert daemon._run_checker(manager, chassis) + daemon.stop_event.wait.assert_called_with(1) + + daemon.stop_event.wait.return_value = True + assert not daemon._run_checker(manager, chassis)