System Ready (#10479)
Why I did it: At present, an event-driven model has no mechanism to know that the system is up, i.e. that all the essential SONiC services are running and all the docker apps are ready, along with the port-ready status, so that network traffic can start. With the asynchronous architecture of SONiC, we cannot verify that the config has been applied all the way down to the HW, but we can get the closest "up" status of each app and derive the system readiness from it. How I did it: A new Python-based system monitor tool is introduced under the system-health framework. It monitors all the essential system host services, including the docker wrapper services, on an event-based model and declares when the system is ready. The framework also lets each docker app report its own closest "up" status. CLIs are provided to fetch the current system status, as well as each service's running status and app-ready status, along with the failure reason if any. How to verify it: Run the "show system-health sysready-status" click CLI and check the syslogs for the system-ready message.
This commit is contained in:
parent
f6927606b3
commit
f37dd770cd
@ -57,6 +57,12 @@
|
|||||||
"has_global_scope": {% if feature + '.service' in installer_services.split(' ') %}true{% else %}false{% endif %},
|
"has_global_scope": {% if feature + '.service' in installer_services.split(' ') %}true{% else %}false{% endif %},
|
||||||
"has_per_asic_scope": {% if feature + '@.service' in installer_services.split(' ') %}true{% else %}false{% endif %},
|
"has_per_asic_scope": {% if feature + '@.service' in installer_services.split(' ') %}true{% else %}false{% endif %},
|
||||||
"auto_restart": "{{autorestart}}",
|
"auto_restart": "{{autorestart}}",
|
||||||
|
{# Set check_up_status to true here once the app marks its readiness in the state db #}
|
||||||
|
{# For now, to support the infrastructure, check_up_status is set to false for bgp, swss and pmon #}
|
||||||
|
{# Once apps such as bgp and syncd support app readiness, their check_up_status can be set to true #}
|
||||||
|
{%- if feature in ["bgp", "swss", "pmon"] %}
|
||||||
|
"check_up_status" : "false",
|
||||||
|
{%- endif %}
|
||||||
{%- if include_kubernetes == "y" %}
|
{%- if include_kubernetes == "y" %}
|
||||||
{%- if feature in ["lldp", "pmon", "radv", "snmp", "telemetry"] %}
|
{%- if feature in ["lldp", "pmon", "radv", "snmp", "telemetry"] %}
|
||||||
"set_owner": "kube", {% else %}
|
"set_owner": "kube", {% else %}
|
||||||
|
@ -890,3 +890,6 @@ sudo cp $BUILD_SCRIPTS_DIR/mask_disabled_services.py $FILESYSTEM_ROOT/tmp/
|
|||||||
sudo chmod a+x $FILESYSTEM_ROOT/tmp/mask_disabled_services.py
|
sudo chmod a+x $FILESYSTEM_ROOT/tmp/mask_disabled_services.py
|
||||||
sudo LANG=C chroot $FILESYSTEM_ROOT /tmp/mask_disabled_services.py
|
sudo LANG=C chroot $FILESYSTEM_ROOT /tmp/mask_disabled_services.py
|
||||||
sudo rm -rf $FILESYSTEM_ROOT/tmp/mask_disabled_services.py
|
sudo rm -rf $FILESYSTEM_ROOT/tmp/mask_disabled_services.py
|
||||||
|
|
||||||
|
|
||||||
|
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install python3-dbus
|
||||||
|
@ -5,6 +5,7 @@ After=rc-local.service database.service
|
|||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
RemainAfterExit=yes
|
||||||
ExecStart=/usr/local/bin/determine-reboot-cause
|
ExecStart=/usr/local/bin/determine-reboot-cause
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
|
@ -86,6 +86,13 @@ module sonic-feature{
|
|||||||
type feature-owner;
|
type feature-owner;
|
||||||
default "local";
|
default "local";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
leaf check_up_status {
|
||||||
|
description "This configuration controls the system ready tool to check
|
||||||
|
the app ready/up status";
|
||||||
|
type boolean;
|
||||||
|
default false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
436
src/system-health/health_checker/sysmonitor.py
Executable file
436
src/system-health/health_checker/sysmonitor.py
Executable file
@ -0,0 +1,436 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import glob
|
||||||
|
import multiprocessing
|
||||||
|
from datetime import datetime
|
||||||
|
from swsscommon import swsscommon
|
||||||
|
from sonic_py_common.logger import Logger
|
||||||
|
from . import utils
|
||||||
|
from sonic_py_common.task_base import ProcessTaskBase
|
||||||
|
from .config import Config
|
||||||
|
|
||||||
|
# Identifier passed to the Logger created below.
SYSLOG_IDENTIFIER = "system#monitor"

# Timeouts used by the tasks in this module.
REDIS_TIMEOUT_MS = 0          # handed to swsscommon.DBConnector for STATE_DB
SELECT_TIMEOUT_MSECS = 1000   # handed to Select.select() by the FEATURE table listener
QUEUE_TIMEOUT = 15            # timeout (seconds) of each blocking get() on the event queue
TASK_STOP_TIMEOUT = 10        # stored as Sysmonitor._stop_timeout_secs

# Last system-wide readiness state that was published ("UP" or "DOWN").
system_allsrv_state = "DOWN"

# Services that are reported as OK even while their unit is inactive.
spl_srv_list = ['database-chassis', 'gbsyncd']

# Manager providing the queue on which the listener tasks hand events to the
# main monitoring loop.
mpmgr = multiprocessing.Manager()
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)
|
||||||
|
|
||||||
|
|
||||||
|
#Subprocess which subscribes to STATE_DB FEATURE table for any update
|
||||||
|
#and push service events to main process via queue
|
||||||
|
class MonitorStateDbTask(ProcessTaskBase):
    """Task that forwards STATE_DB ``FEATURE`` table updates to a queue.

    Every entry popped from the table is turned into a message of the form
    ``{"unit": "<key>.service", "evt_src": "feature", "time": <UTC timestamp>}``
    and put on the queue supplied at construction time.
    """

    def __init__(self,myQ):
        """Remember *myQ*, the queue on which events are delivered."""
        ProcessTaskBase.__init__(self)
        self.task_queue = myQ

    def subscribe_statedb(self):
        """Poll the FEATURE table until the task is asked to stop."""
        db = swsscommon.DBConnector("STATE_DB", REDIS_TIMEOUT_MS, True)
        selector = swsscommon.Select()
        feature_table = swsscommon.SubscriberStateTable(db, "FEATURE")
        selector.addSelectable(feature_table)

        while not self.task_stopping_event.is_set():
            result, _ = selector.select(SELECT_TIMEOUT_MSECS)
            if result == swsscommon.Select.OBJECT:
                # Only the key is needed; the operation and field values are ignored.
                feature, _, _ = feature_table.pop()
                self.task_notify({
                    "unit": feature + ".service",
                    "evt_src": "feature",
                    "time": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
                })
            elif result != swsscommon.Select.TIMEOUT:
                logger.log_warning("sel.select() did not return swsscommon.Select.OBJECT")

    def task_worker(self):
        """Task entry point: run the subscription loop and log any failure."""
        if not self.task_stopping_event.is_set():
            try:
                self.subscribe_statedb()
            except Exception as e:
                logger.log_error("subscribe_statedb exited- {}".format(str(e)))

    def task_notify(self, msg):
        """Queue *msg* unless the task is already stopping."""
        if not self.task_stopping_event.is_set():
            self.task_queue.put(msg)
|
||||||
|
|
||||||
|
|
||||||
|
#Subprocess which subscribes to system dbus to listen for systemd events
|
||||||
|
#and push service events to main process via queue
|
||||||
|
class MonitorSystemBusTask(ProcessTaskBase):
    """Task that forwards completed systemd jobs to a queue.

    Subscribes to the ``JobRemoved`` signal of the systemd manager on the
    system D-Bus. For every job whose result is ``"done"`` a message of the
    form ``{"unit": <unit>, "evt_src": "sysbus", "time": <UTC timestamp>}``
    is put on the queue supplied at construction time.
    """

    # Interval, in seconds, at which the GLib loop checks for a stop request.
    _STOP_POLL_SECS = 1

    def __init__(self,myQ):
        """Remember *myQ*, the queue on which events are delivered."""
        ProcessTaskBase.__init__(self)
        self.task_queue = myQ

    def on_job_removed(self, id, job, unit, result):
        """Signal handler for ``JobRemoved``; only "done" jobs are reported."""
        if result == "done":
            timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            msg = {"unit": unit, "evt_src":"sysbus", "time":timestamp}
            self.task_notify(msg)
        return

    def _quit_if_stopping(self, loop):
        """GLib timer callback: leave *loop* once a stop has been requested.

        Returns True to keep the timer armed, False to remove it.
        """
        if self.task_stopping_event.is_set():
            loop.quit()
            return False
        return True

    #Function for listening the systemd event on dbus
    def subscribe_sysbus(self):
        """Connect to systemd over D-Bus and run the GLib main loop."""
        # Imported here so the D-Bus/GLib bindings are only needed by the
        # process that actually runs this task.
        import dbus
        from gi.repository import GLib
        from dbus.mainloop.glib import DBusGMainLoop

        DBusGMainLoop(set_as_default=True)
        bus = dbus.SystemBus()
        systemd = bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1')
        manager = dbus.Interface(systemd, 'org.freedesktop.systemd1.Manager')
        manager.Subscribe()
        manager.connect_to_signal('JobRemoved', self.on_job_removed)

        loop = GLib.MainLoop()
        # Without this periodic check loop.run() never returns, so the task
        # could not exit on its own when the stopping event is set.
        GLib.timeout_add_seconds(self._STOP_POLL_SECS, self._quit_if_stopping, loop)
        loop.run()

    def task_worker(self):
        """Task entry point: listen on the bus and log any failure."""
        if self.task_stopping_event.is_set():
            return
        logger.log_info("Start Listening to systemd bus (pid {0})".format(os.getpid()))
        try:
            self.subscribe_sysbus()
        except Exception as e:
            # Mirror MonitorStateDbTask: record why the listener went away
            # instead of letting the exception escape unlogged.
            logger.log_error("subscribe_sysbus exited- {}".format(str(e)))

    def task_notify(self, msg):
        """Queue *msg* unless the task is already stopping."""
        if self.task_stopping_event.is_set():
            return
        self.task_queue.put(msg)
|
||||||
|
|
||||||
|
#Mainprocess which launches 2 subtasks - systembus task and statedb task
|
||||||
|
#and on receiving events, checks and updates the system ready status to state db
|
||||||
|
class Sysmonitor(ProcessTaskBase):
    """Main monitoring task that derives and publishes the system ready state.

    It starts the systemd bus listener and the STATE_DB FEATURE listener,
    re-evaluates the affected unit for every event they queue, records each
    unit under ``ALL_SERVICE_STATUS|<name>`` in STATE_DB and publishes the
    aggregate state under ``SYSTEM_READY|SYSTEM_STATE``.
    """

    def __init__(self):
        ProcessTaskBase.__init__(self)
        self._stop_timeout_secs = TASK_STOP_TIMEOUT
        # Unit names (e.g. "bgp.service") currently considered not ready.
        self.dnsrvs_name = set()
        # Database connectors are created lazily on first use.
        self.state_db = None
        self.config_db = None
        self.config = Config()

    def _connect_state_db(self):
        """Return the STATE_DB connector, creating it on first use."""
        if not self.state_db:
            self.state_db = swsscommon.SonicV2Connector(host='127.0.0.1')
            self.state_db.connect(self.state_db.STATE_DB)
        return self.state_db

    def _connect_config_db(self):
        """Return the CONFIG_DB connector, creating it on first use."""
        if not self.config_db:
            self.config_db = swsscommon.ConfigDBConnector()
            self.config_db.connect()
        return self.config_db

    #Sets system ready status to state db
    def post_system_status(self, state):
        """Write *state* to ``SYSTEM_READY|SYSTEM_STATE``; failures are logged."""
        try:
            state_db = self._connect_state_db()
            state_db.set(state_db.STATE_DB, "SYSTEM_READY|SYSTEM_STATE", "Status", state)
            logger.log_info("Posting system ready status {} to statedb".format(state))
        except Exception as e:
            logger.log_error("Unable to post system ready status: {}".format(str(e)))

    #Forms the service list to be monitored
    def get_all_service_list(self):
        """Return the sorted list of unit names to monitor.

        The list is the union of the ``*.service`` entries found in the
        multi-user and sonic target ``.wants`` directories and of every
        FEATURE entry in CONFIG_DB whose state is not ``disabled`` or
        ``always_disabled``, minus the configured ``ignore_services``.
        """
        config_db = self._connect_config_db()

        # A set removes the duplicates produced when a unit is wanted by both
        # targets; with a list, removing an ignored service only dropped its
        # first occurrence and the unit stayed monitored.
        services = set()

        #add the services from the below targets
        targets = ["/etc/systemd/system/multi-user.target.wants", "/etc/systemd/system/sonic.target.wants"]
        for path in targets:
            services.update(os.path.basename(unit) for unit in glob.glob('{}/*.service'.format(path)))

        #add the enabled docker services from config db feature table
        feature_table = config_db.get_table("FEATURE")
        for feature, attrs in feature_table.items():
            # An entry without a "state" field is monitored rather than
            # aborting the whole scan with a KeyError.
            if attrs.get("state") not in ("disabled", "always_disabled"):
                services.add(feature + ".service")

        self.config.load_config()
        if self.config and self.config.ignore_services:
            services.difference_update(self.config.ignore_services)

        return sorted(services)

    #Checks FEATURE table from config db for the service' check_up_status flag
    #if marked to true, then read the service up_status from FEATURE table of state db.
    #else, just return Up
    def get_app_ready_status(self, service):
        """Return ``(state, fail_reason, update_time)`` for feature *service*.

        *state* is "Up" unless the feature has ``check_up_status`` set to
        true in CONFIG_DB and its ``up_status`` in STATE_DB is not true, in
        which case it is "Down" and *fail_reason* comes from STATE_DB ("NA"
        when absent). *update_time* is "-" when unknown.
        """
        state_db = self._connect_state_db()
        config_db = self._connect_config_db()

        fail_reason = ""
        update_time = "-"
        feature_table = config_db.get_table('FEATURE')

        if service not in feature_table:
            return "Up", fail_reason, update_time

        check_up_status = feature_table[service].get('check_up_status')
        if check_up_status is None or check_up_status.lower() != "true":
            #Either check_up_status marked False or entry does not exist
            return "Up", fail_reason, update_time

        key = 'FEATURE|{}'.format(service)
        up_status = state_db.get(state_db.STATE_DB, key, 'up_status')
        if up_status is not None and up_status.lower() == "true":
            pstate = "Up"
        else:
            fail_reason = state_db.get(state_db.STATE_DB, key, 'fail_reason')
            if fail_reason is None:
                fail_reason = "NA"
            pstate = "Down"

        update_time = state_db.get(state_db.STATE_DB, key, 'update_time')
        if update_time is None:
            update_time = "-"

        return pstate, fail_reason, update_time

    #Gets the service properties
    def run_systemctl_show(self, service):
        """Return the ``systemctl show`` properties of *service* as a dict.

        An empty dict is returned when the command yields no output.
        """
        # NOTE(review): the unit name is interpolated unquoted; whether that
        # matters depends on how utils.run_command executes the string.
        command = ('systemctl show {} --property=Id,LoadState,UnitFileState,Type,ActiveState,SubState,Result'.format(service))
        output = utils.run_command(command)
        if not output:
            return {}

        prop_dict = {}
        for line in output.split('\n'):
            name, sep, value = line.partition("=")
            if sep:
                prop_dict[name] = value
        return prop_dict

    #Sets the service status to state db
    def post_unit_status(self, srv_name, srv_status, app_status, fail_reason, update_time):
        """Store the status fields of *srv_name* under ``ALL_SERVICE_STATUS``."""
        state_db = self._connect_state_db()

        key = 'ALL_SERVICE_STATUS|{}'.format(srv_name)
        statusvalue = {
            'service_status': srv_status,
            'app_ready_status': app_status,
            'fail_reason': fail_reason,
            'update_time': update_time,
        }
        state_db.hmset(state_db.STATE_DB, key, statusvalue)

    #Reads the current status of the service and posts it to state db
    def get_unit_status(self, event):
        """Evaluate unit *event*, post its status and return "OK" or "NOT OK".

        ``None`` is returned, and nothing is posted, when the unit is not
        loaded or when its status could not be determined (the error is
        logged).
        """
        unit_status = "NOT OK"
        update_time = "-"
        # Bound before the try block so the error log below can always use it.
        service_name = event

        try:
            service_status = "Down"
            service_up_status = "Down"
            # Strip only the unit-type suffix: unit names may themselves
            # contain dots, which made a two-way split('.') unpack fail.
            service_name = event.rsplit('.', 1)[0]

            sysctl_show = self.run_systemctl_show(event)

            if sysctl_show['LoadState'] != "loaded":
                return None

            status = sysctl_show['UnitFileState']
            fail_reason = sysctl_show['Result']
            active_state = sysctl_show['ActiveState']
            sub_state = sysctl_show['SubState']
            srv_type = sysctl_show['Type']

            #Raise syslog for service state change
            logger.log_info("{} service state changed to [{}/{}]".format(event, active_state, sub_state))

            if status in ("enabled", "enabled-runtime", "static"):
                if fail_reason == "success":
                    fail_reason = "-"
                if active_state == "active" and sub_state == "exited":
                    service_status = "OK"
                    service_up_status = "OK"
                    unit_status = "OK"
                elif active_state == "active" and sub_state == "running":
                    service_status = "OK"
                    init_state, app_fail_reason, update_time = self.get_app_ready_status(service_name)
                    if init_state == "Up":
                        service_up_status = "OK"
                        unit_status = "OK"
                    else:
                        fail_reason = app_fail_reason
                        unit_status = "NOT OK"
                        if fail_reason == "docker start":
                            service_up_status = "Starting"
                            fail_reason = "-"
                elif active_state == "activating":
                    service_status = "Starting"
                    service_up_status = "Starting"
                elif active_state == "deactivating":
                    service_status = "Stopping"
                    service_up_status = "Stopping"
                elif active_state == "inactive":
                    # Finished oneshot units and the special services are
                    # legitimately inactive.
                    if srv_type == "oneshot" or service_name in spl_srv_list:
                        service_status = "OK"
                        service_up_status = "OK"
                        unit_status = "OK"
                    else:
                        unit_status = "NOT OK"
                        if fail_reason == "-":
                            fail_reason = "Inactive"
                else:
                    unit_status = "NOT OK"

            self.post_unit_status(service_name, service_status, service_up_status, fail_reason, update_time)

            return unit_status

        except Exception as e:
            logger.log_error("Get unit status {}-{}".format(service_name, str(e)))
            return None

    #Gets status of all the services from service list
    def get_all_system_status(self):
        """Scan every monitored service and return "UP" or "DOWN"."""
        for service in self.get_all_service_list():
            if self.get_unit_status(service) == "NOT OK":
                self.dnsrvs_name.add(service)

        return "DOWN" if self.dnsrvs_name else "UP"

    #Displays the system ready status message on console
    def print_console_message(self, message):
        """Write a timestamped *message* to ``/dev/console``."""
        with open('/dev/console', 'w') as console:
            console.write("\n{} {}\n".format(datetime.now().strftime("%b %d %H:%M:%S.%f"), message))

    #Publish the system ready status message on logger,console and state db
    def publish_system_status(self, astate):
        """Announce *astate* on syslog, console and STATE_DB when it changed."""
        global system_allsrv_state
        if system_allsrv_state == astate:
            return

        system_allsrv_state = astate
        if astate == "DOWN":
            msg = "System is not ready - one or more services are not up"
        elif astate == "UP":
            msg = "System is ready"
        else:
            # Not produced by this class; avoids an unbound message below.
            msg = "System ready status changed to {}".format(astate)

        logger.log_notice(msg)
        try:
            self.print_console_message(msg)
        except OSError as e:
            # The console is best effort. The cached state has already been
            # updated, so skipping the STATE_DB write here would leave it
            # stale until the next state change.
            logger.log_warning("Unable to write system ready status to console: {}".format(str(e)))
        self.post_system_status(astate)

    #Checks all the services and updates the current system status
    def update_system_status(self):
        """Run a full scan and publish the result; failures are logged."""
        try:
            astate = self.get_all_system_status()
            self.publish_system_status(astate)
        except Exception as e:
            logger.log_error("update system status exception:{}".format(str(e)))

    #Checks a service status and updates the system status
    def check_unit_status(self, event):
        """Handle one unit event and publish the resulting system state.

        Always returns 0.
        """
        state_db = self._connect_state_db()

        if event in self.get_all_service_list():
            ustate = self.get_unit_status(event)
            if ustate == "OK":
                if system_allsrv_state == "UP":
                    astate = "UP"
                else:
                    self.dnsrvs_name.discard(event)
                    astate = "DOWN" if self.dnsrvs_name else "UP"
            else:
                self.dnsrvs_name.add(event)
                astate = "DOWN"

            self.publish_system_status(astate)
        else:
            #if received event is not in current full service list but exists in STATE_DB & set,
            #then it should be removed from STATE_DB & set
            self.dnsrvs_name.discard(event)

            # This class only posts status for "*.service" units. Stripping
            # the suffix of any other unit type (foo.timer, foo.mount, ...)
            # would address, and delete, the entry of an unrelated
            # foo.service.
            if event.endswith('.service'):
                key = 'ALL_SERVICE_STATUS|{}'.format(event.rsplit('.', 1)[0])
                if state_db.exists(state_db.STATE_DB, key) == 1:
                    state_db.delete(state_db.STATE_DB, key)

        return 0

    def system_service(self):
        """Start the listener tasks and process their events until stopped."""
        state_db = self._connect_state_db()

        myQ = mpmgr.Queue()
        try:
            monitor_system_bus = MonitorSystemBusTask(myQ)
            monitor_system_bus.task_run()

            monitor_statedb_table = MonitorStateDbTask(myQ)
            monitor_statedb_table.task_run()
        except Exception as e:
            logger.log_error("SubProcess-{}".format(str(e)))
            sys.exit(1)

        from queue import Empty

        try:
            self.update_system_status()

            # Queue to receive the STATEDB and Systemd state change event
            while not self.task_stopping_event.is_set():
                try:
                    msg = myQ.get(timeout=QUEUE_TIMEOUT)
                    event = msg["unit"]
                    event_src = msg["evt_src"]
                    event_time = msg["time"]
                    logger.log_debug("Main process- received event:{} from source:{} time:{}".format(event,event_src,event_time))
                    logger.log_info("check_unit_status for [ "+event+" ] ")
                    self.check_unit_status(event)
                except Empty:
                    # No event within the timeout: re-check the stop flag.
                    pass
                except Exception as e:
                    logger.log_error("system_service"+str(e))

            #cleanup tables "'ALL_SERVICE_STATUS*', 'SYSTEM_READY*'" from statedb
            state_db.delete_all_by_pattern(state_db.STATE_DB, "ALL_SERVICE_STATUS|*")
            state_db.delete_all_by_pattern(state_db.STATE_DB, "SYSTEM_READY|*")
        finally:
            # Stop the listeners even if the cleanup above fails.
            monitor_system_bus.task_stop()
            monitor_statedb_table.task_stop()

    def task_worker(self):
        """Task entry point."""
        if self.task_stopping_event.is_set():
            return
        self.system_service()
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -12,6 +12,8 @@ from sonic_py_common.daemon_base import DaemonBase
|
|||||||
from swsscommon.swsscommon import SonicV2Connector
|
from swsscommon.swsscommon import SonicV2Connector
|
||||||
|
|
||||||
from health_checker.manager import HealthCheckerManager
|
from health_checker.manager import HealthCheckerManager
|
||||||
|
from health_checker.sysmonitor import Sysmonitor
|
||||||
|
|
||||||
|
|
||||||
SYSLOG_IDENTIFIER = 'healthd'
|
SYSLOG_IDENTIFIER = 'healthd'
|
||||||
|
|
||||||
@ -75,6 +77,8 @@ class HealthDaemon(DaemonBase):
|
|||||||
if not manager.config.config_file_exists():
|
if not manager.config.config_file_exists():
|
||||||
self.log_warning("System health configuration file not found, exit...")
|
self.log_warning("System health configuration file not found, exit...")
|
||||||
return
|
return
|
||||||
|
sysmon = Sysmonitor()
|
||||||
|
sysmon.task_run()
|
||||||
while 1:
|
while 1:
|
||||||
stat = manager.check(chassis)
|
stat = manager.check(chassis)
|
||||||
self._process_stat(chassis, manager.config, stat)
|
self._process_stat(chassis, manager.config, stat)
|
||||||
@ -85,6 +89,7 @@ class HealthDaemon(DaemonBase):
|
|||||||
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
|
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")
|
||||||
|
|
||||||
self.deinit()
|
self.deinit()
|
||||||
|
sysmon.task_stop()
|
||||||
|
|
||||||
def _process_stat(self, chassis, config, stat):
|
def _process_stat(self, chassis, config, stat):
|
||||||
from health_checker.health_checker import HealthChecker
|
from health_checker.health_checker import HealthChecker
|
||||||
|
@ -22,3 +22,12 @@ class MockConnector(object):
|
|||||||
|
|
||||||
def get_all(self, db_id, key):
|
def get_all(self, db_id, key):
|
||||||
return MockConnector.data[key]
|
return MockConnector.data[key]
|
||||||
|
|
||||||
|
def set(self, db_id, key, field, value):
    """Store *value* under *field* of hash *key*.

    Other fields already stored for *key* are kept, as with Redis HSET;
    previously every call replaced the whole hash. *db_id* is ignored.
    """
    self.data.setdefault(key, {})[field] = value
|
||||||
|
|
||||||
|
def hmset(self, db_id, key, fieldsvalues):
    """Store every field/value pair of *fieldsvalues* in hash *key*.

    Fields of *key* that are not named in *fieldsvalues* are kept, as with
    Redis HMSET; previously the whole hash was replaced. *db_id* is ignored.
    """
    self.data.setdefault(key, {}).update(fieldsvalues)
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
1. test_user_defined_checker mocks the output of a user defined checker and verify class UserDefinedChecker
|
1. test_user_defined_checker mocks the output of a user defined checker and verify class UserDefinedChecker
|
||||||
2. test_service_checker mocks the output of monit service and verify class ServiceChecker
|
2. test_service_checker mocks the output of monit service and verify class ServiceChecker
|
||||||
3. test_hardware_checker mocks the hardware status data in db and verify class HardwareChecker
|
3. test_hardware_checker mocks the hardware status data in db and verify class HardwareChecker
|
||||||
|
4. Mocks and tests the system ready status and verify class Sysmonitor
|
||||||
And there are class that are not covered by unit test. These class will be covered by sonic-mgmt regression test.
|
And there are class that are not covered by unit test. These class will be covered by sonic-mgmt regression test.
|
||||||
1. HealthDaemon
|
1. HealthDaemon
|
||||||
2. HealthCheckerManager
|
2. HealthCheckerManager
|
||||||
@ -30,6 +31,9 @@ from health_checker.health_checker import HealthChecker
|
|||||||
from health_checker.manager import HealthCheckerManager
|
from health_checker.manager import HealthCheckerManager
|
||||||
from health_checker.service_checker import ServiceChecker
|
from health_checker.service_checker import ServiceChecker
|
||||||
from health_checker.user_defined_checker import UserDefinedChecker
|
from health_checker.user_defined_checker import UserDefinedChecker
|
||||||
|
from health_checker.sysmonitor import Sysmonitor
|
||||||
|
from health_checker.sysmonitor import MonitorStateDbTask
|
||||||
|
from health_checker.sysmonitor import MonitorSystemBusTask
|
||||||
|
|
||||||
mock_supervisorctl_output = """
|
mock_supervisorctl_output = """
|
||||||
snmpd RUNNING pid 67, uptime 1:03:56
|
snmpd RUNNING pid 67, uptime 1:03:56
|
||||||
@ -505,3 +509,214 @@ def test_utils():
|
|||||||
|
|
||||||
output = utils.run_command('ls')
|
output = utils.run_command('ls')
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
|
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
@patch('docker.DockerClient')
@patch('health_checker.utils.run_command')
@patch('swsscommon.swsscommon.ConfigDBConnector')
def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client):
    """Enabled features are listed as services, disabled ones are not."""
    feature_states = (('radv', 'enabled'), ('bgp', 'enabled'), ('pmon', 'disabled'))
    get_table = MagicMock()
    get_table.return_value = {
        name: {
            'state': state,
            'has_global_scope': 'True',
            'has_per_asic_scope': 'False',
        }
        for name, state in feature_states
    }
    config_db = MagicMock()
    config_db.get_table = get_table
    mock_config_db.return_value = config_db

    sysmon = Sysmonitor()
    print("mock get table:{}".format(get_table.return_value))
    result = sysmon.get_all_service_list()
    print("result get all service list:{}".format(result))
    assert 'radv.service' in result
    assert 'pmon.service' not in result
|
||||||
|
|
||||||
|
|
||||||
|
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
@patch('docker.DockerClient')
@patch('health_checker.utils.run_command')
@patch('swsscommon.swsscommon.ConfigDBConnector')
def test_get_app_ready_status(mock_config_db, mock_run, mock_docker_client):
    """App readiness follows up_status only when check_up_status is true."""
    def feature(check_up_status):
        # CONFIG_DB FEATURE entry differing only in its check_up_status flag.
        return {
            'state': 'enabled',
            'has_global_scope': 'True',
            'has_per_asic_scope': 'False',
            'check_up_status': check_up_status
        }

    get_table = MagicMock()
    get_table.return_value = {
        'radv': feature('True'),
        'bgp': feature('True'),
        'snmp': feature('False'),
    }
    config_db = MagicMock()
    config_db.get_table = get_table
    mock_config_db.return_value = config_db

    MockConnector.data.update({
        'FEATURE|radv': {'up_status': 'True', 'fail_reason': '-', 'update_time': '-'},
        'FEATURE|bgp': {'up_status': 'False', 'fail_reason': 'some error', 'update_time': '-'},
    })

    sysmon = Sysmonitor()
    for service, expected in (('radv', 'Up'), ('bgp', 'Down'), ('snmp', 'Up')):
        result = sysmon.get_app_ready_status(service)
        print(result)
        assert expected in result
|
||||||
|
|
||||||
|
|
||||||
|
# Canned "systemctl show" property sets: a running unit and an inactive one.
mock_srv_props = {
    'mock_radv.service': dict(
        Type='simple',
        Result='success',
        Id='mock_radv.service',
        LoadState='loaded',
        ActiveState='active',
        SubState='running',
        UnitFileState='enabled',
    ),
    'mock_bgp.service': dict(
        Type='simple',
        Result='success',
        Id='mock_bgp.service',
        LoadState='loaded',
        ActiveState='inactive',
        SubState='dead',
        UnitFileState='enabled',
    ),
}
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service', 'mock_bgp.service', 'mock_ns.service']))
@patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_bgp.service']))
@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Down','-','-')))
@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock())
def test_check_unit_status():
    """An inactive monitored unit ends up in the set of down services."""
    unit = 'mock_bgp.service'
    monitor = Sysmonitor()
    monitor.check_unit_status(unit)
    assert unit in monitor.dnsrvs_name
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_radv.service']))
@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Up','-','-')))
@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock())
def test_get_unit_status_ok():
    """A running unit whose app is up is reported as OK."""
    monitor = Sysmonitor()
    status = monitor.get_unit_status('mock_radv.service')
    print("get_unit_status:{}".format(status))
    assert status == 'OK'
|
||||||
|
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_bgp.service']))
@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Up','-','-')))
@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock())
def test_get_unit_status_not_ok():
    """An inactive (non-oneshot) unit is reported as NOT OK."""
    monitor = Sysmonitor()
    status = monitor.get_unit_status('mock_bgp.service')
    print("get_unit_status:{}".format(status))
    assert status == 'NOT OK'
|
||||||
|
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service', 'mock_ns.service']))
@patch('health_checker.sysmonitor.Sysmonitor.get_unit_status', MagicMock(return_value='OK'))
@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock())
@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock())
@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value='Up'))
def test_get_all_system_status_ok():
    """The system is UP when every monitored unit reports OK."""
    monitor = Sysmonitor()
    overall = monitor.get_all_system_status()
    print("result:{}".format(overall))
    assert overall == 'UP'
|
||||||
|
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service', 'mock_ns.service']))
@patch('health_checker.sysmonitor.Sysmonitor.get_unit_status', MagicMock(return_value='NOT OK'))
@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', MagicMock())
@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock())
# The mock returns a 3-tuple, matching the shape the other tests in this
# file use for get_app_ready_status; a bare string would break any caller
# that unpacks the result.
@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Up', '-', '-')))
def test_get_all_system_status_not_ok():
    """get_all_system_status() returns 'DOWN' when units report 'NOT OK'.

    The service list is fixed to two mock services and get_unit_status()
    is mocked to return 'NOT OK' for each of them; publishing and state-db
    writes are mocked out.
    """
    sysmon = Sysmonitor()
    result = sysmon.get_all_system_status()
    # Printed so the value appears in pytest's captured output on failure.
    print("result:{}".format(result))
    assert result == 'DOWN'
|
||||||
|
|
||||||
|
def test_post_unit_status():
    """post_unit_status() writes the service entry that is read back here.

    After posting for "mock_bgp", the 'ALL_SERVICE_STATUS|mock_bgp' entry
    is fetched through the mocked connector and its service status,
    app-ready status and failure reason are compared with the posted values.
    """
    monitor = Sysmonitor()
    monitor.post_unit_status("mock_bgp", 'OK', 'Down', 'mock reason', '-')

    entry = swsscommon.SonicV2Connector.get_all(MockConnector, 0, 'ALL_SERVICE_STATUS|mock_bgp')
    print(entry)

    # Checked in the same field order as the values were posted.
    expected_fields = (
        ('service_status', 'OK'),
        ('app_ready_status', 'Down'),
        ('fail_reason', 'mock reason'),
    )
    for field, expected in expected_fields:
        assert entry[field] == expected
|
||||||
|
|
||||||
|
def test_post_system_status():
    """post_system_status("UP") stores "UP" under SYSTEM_READY|SYSTEM_STATE.

    The value is read back from the 'Status' field through the mocked
    connector.
    """
    monitor = Sysmonitor()
    monitor.post_system_status("UP")

    stored_state = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
    message = "post system status result:{}".format(stored_state)
    print(message)

    assert stored_state == "UP"
|
||||||
|
|
||||||
|
# Only the console output is mocked. publish_system_status() itself and
# post_system_status() are left real so the test exercises the code it is
# named after. Never pass ``some_test()`` as patch()'s replacement object:
# the call runs that test at import time and installs its return value
# (None) in place of the method.
@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock())
def test_publish_system_status():
    """publish_system_status('UP') leaves "UP" in SYSTEM_READY|SYSTEM_STATE.

    The 'Status' field is read back through the mocked connector after the
    real publish_system_status() has run.
    """
    sysmon = Sysmonitor()
    # NOTE(review): assumes publish_system_status() persists the state via
    # post_system_status() - confirm against sysmonitor.py.
    sysmon.publish_system_status('UP')
    result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
    assert result == "UP"
|
||||||
|
|
||||||
|
# get_all_system_status() is replaced by a callable mock that reports 'UP'.
# patch()'s second argument must be the replacement object itself; passing
# ``some_test()`` would run that test at import time and install its None
# return value, making the patched method non-callable.
@patch('health_checker.sysmonitor.Sysmonitor.get_all_system_status', MagicMock(return_value='UP'))
# Console output is mocked so publishing does not touch the real console.
@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock())
def test_update_system_status():
    """update_system_status() leaves "UP" in SYSTEM_READY|SYSTEM_STATE.

    The aggregated status is mocked as 'UP'; publishing is left real and
    the 'Status' field is read back through the mocked connector.
    """
    sysmon = Sysmonitor()
    # NOTE(review): assumes update_system_status() forwards the result of
    # get_all_system_status() to publish_system_status() - confirm against
    # sysmonitor.py.
    sysmon.update_system_status()
    result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
    assert result == "UP"
|
||||||
|
|
||||||
|
from sonic_py_common.task_base import ProcessTaskBase
|
||||||
|
import multiprocessing
|
||||||
|
# Module-level multiprocessing manager, created once when this test module
# is imported.
mpmgr = multiprocessing.Manager()

# Manager-backed queue passed to the MonitorStateDbTask and
# MonitorSystemBusTask constructors in the tests below.
myQ = mpmgr.Queue()
|
||||||
|
def test_monitor_statedb_task():
    """MonitorStateDbTask starts a task process and can be stopped again."""
    statedb_task = MonitorStateDbTask(myQ)
    # A mock is attached under the SubscriberStateTable attribute name.
    # NOTE(review): whether the task actually reads this instance attribute
    # is not visible from this file - confirm.
    statedb_task.SubscriberStateTable = MagicMock()

    statedb_task.task_run()
    started = statedb_task._task_process is not None
    assert started

    statedb_task.task_stop()
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock())
def test_monitor_sysbus_task():
    """MonitorSystemBusTask starts a task process and can be stopped again.

    subscribe_sysbus() is mocked out for the duration of the test.
    """
    sysbus_task = MonitorSystemBusTask(myQ)
    # A mock is attached under the SubscriberStateTable attribute name.
    # NOTE(review): whether the task actually reads this instance attribute
    # is not visible from this file - confirm.
    sysbus_task.SubscriberStateTable = MagicMock()

    sysbus_task.task_run()
    started = sysbus_task._task_process is not None
    assert started

    sysbus_task.task_stop()
|
||||||
|
|
||||||
|
@patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock())
@patch('health_checker.sysmonitor.MonitorStateDbTask.subscribe_statedb', MagicMock())
def test_system_service():
    """Sysmonitor starts a task process and can be stopped again.

    Both subscription entry points (system bus and state db) are mocked
    out for the duration of the test.
    """
    service = Sysmonitor()

    service.task_run()
    started = service._task_process is not None
    assert started

    service.task_stop()
|
||||||
|
Loading…
Reference in New Issue
Block a user