[system-health] Fix error log system_service'state' while doing confi… (#11225)
- Why I did it While doing config reload, FEATURE table may be removed and re-add. During this process, updating FEATURE table is not atomic. It could be that the FEATURE table has entry, but each entry has no field. This PR introduces a retry mechanism to avoid this. - How I did it Introduces a retry mechanism to avoid this. - How to verify it New unit test added to verify the flow as well as running some manual test.
This commit is contained in:
parent
97cb613729
commit
097e315f08
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import glob
|
import glob
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -145,12 +146,7 @@ class Sysmonitor(ProcessTaskBase):
|
|||||||
dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))]
|
dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))]
|
||||||
|
|
||||||
#add the enabled docker services from config db feature table
|
#add the enabled docker services from config db feature table
|
||||||
feature_table = self.config_db.get_table("FEATURE")
|
self.get_service_from_feature_table(dir_list)
|
||||||
for srv in feature_table.keys():
|
|
||||||
if feature_table[srv]["state"] not in ["disabled", "always_disabled"]:
|
|
||||||
srvext = srv + ".service"
|
|
||||||
if srvext not in dir_list:
|
|
||||||
dir_list.append(srvext)
|
|
||||||
|
|
||||||
self.config.load_config()
|
self.config.load_config()
|
||||||
if self.config and self.config.ignore_services:
|
if self.config and self.config.ignore_services:
|
||||||
@ -161,6 +157,41 @@ class Sysmonitor(ProcessTaskBase):
|
|||||||
dir_list.sort()
|
dir_list.sort()
|
||||||
return dir_list
|
return dir_list
|
||||||
|
|
||||||
|
def get_service_from_feature_table(self, dir_list):
|
||||||
|
"""Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table
|
||||||
|
is not an atomic operation, sonic-cfggen do it with two steps:
|
||||||
|
1. Add an empty table entry to CONFIG DB
|
||||||
|
2. Add all fields to the table
|
||||||
|
|
||||||
|
So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry
|
||||||
|
mechanism is here to avoid such issue.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dir_list (list): service list
|
||||||
|
"""
|
||||||
|
max_retry = 3
|
||||||
|
retry_delay = 1
|
||||||
|
success = True
|
||||||
|
|
||||||
|
while max_retry > 0:
|
||||||
|
success = True
|
||||||
|
feature_table = self.config_db.get_table("FEATURE")
|
||||||
|
for srv, fields in feature_table.items():
|
||||||
|
if 'state' not in fields:
|
||||||
|
success = False
|
||||||
|
logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table))
|
||||||
|
break
|
||||||
|
if fields["state"] not in ["disabled", "always_disabled"]:
|
||||||
|
srvext = srv + ".service"
|
||||||
|
if srvext not in dir_list:
|
||||||
|
dir_list.append(srvext)
|
||||||
|
if not success:
|
||||||
|
max_retry -= 1
|
||||||
|
time.sleep(retry_delay)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
if not success:
|
||||||
|
logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table))
|
||||||
|
|
||||||
#Checks FEATURE table from config db for the service' check_up_status flag
|
#Checks FEATURE table from config db for the service' check_up_status flag
|
||||||
#if marked to true, then read the service up_status from FEATURE table of state db.
|
#if marked to true, then read the service up_status from FEATURE table of state db.
|
||||||
|
@ -720,3 +720,23 @@ def test_system_service():
|
|||||||
sysmon.task_run()
|
sysmon.task_run()
|
||||||
assert sysmon._task_process is not None
|
assert sysmon._task_process is not None
|
||||||
sysmon.task_stop()
|
sysmon.task_stop()
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_service_from_feature_table():
|
||||||
|
sysmon = Sysmonitor()
|
||||||
|
sysmon.config_db = MagicMock()
|
||||||
|
sysmon.config_db.get_table = MagicMock()
|
||||||
|
sysmon.config_db.get_table.side_effect = [
|
||||||
|
{
|
||||||
|
'bgp': {},
|
||||||
|
'swss': {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'bgp': {'state': 'enabled'},
|
||||||
|
'swss': {'state': 'disabled'}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
dir_list = []
|
||||||
|
sysmon.get_service_from_feature_table(dir_list)
|
||||||
|
assert 'bgp.service' in dir_list
|
||||||
|
assert 'swss.service' not in dir_list
|
||||||
|
Reference in New Issue
Block a user