sysready (#8889)
This commit is contained in:
parent
8219975d6f
commit
d7e5372e54
@ -51,7 +51,27 @@
|
||||
{%- if feature in ["lldp", "pmon", "radv", "snmp", "telemetry"] %}
|
||||
"set_owner": "kube", {% else %}
|
||||
"set_owner": "local", {% endif %} {% endif %}
|
||||
"high_mem_alert": "disabled"
|
||||
"high_mem_alert": "disabled",
|
||||
{%- if feature in ["bgp", "swss", "pmon", "nat", "teamd", "dhcp_relay", "sflow", "l2mcd", "udld", "stp", "snmp", "lldp", "radv", "iccpd", "syncd", "vrrp", "mgmt-framework", "tam"] %}
|
||||
"check_up_status" : "false"
|
||||
{%- else %}
|
||||
"check_up_status" : "false"
|
||||
{%- endif %}
|
||||
}{% if not loop.last %},{% endif -%}
|
||||
{% endfor %}
|
||||
},
|
||||
{%- set host_features = [("caclmgrd", false),
|
||||
("cron", false),
|
||||
("docker", false),
|
||||
("hostcfgd", false)] %}
|
||||
"HOST_FEATURE": {
|
||||
{%- for host_feature, check_up_status in host_features %}
|
||||
"{{host_feature}}": {
|
||||
{%- if host_feature in ["caclmgrd", "hostcfgd"] %}
|
||||
"check_up_status" : "false"
|
||||
{%- else %}
|
||||
"check_up_status" : "false"
|
||||
{%- endif %}
|
||||
}{% if not loop.last %},{% endif -%}
|
||||
{% endfor %}
|
||||
}
|
||||
|
@ -390,6 +390,14 @@ echo "ntp.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
||||
sudo LANG=C cp $IMAGE_CONFIGS/warmboot-finalizer/finalize-warmboot.sh $FILESYSTEM_ROOT/usr/local/bin/finalize-warmboot.sh
|
||||
sudo LANG=C cp $IMAGE_CONFIGS/warmboot-finalizer/warmboot-finalizer.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
|
||||
echo "warmboot-finalizer.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable warmboot-finalizer.service
|
||||
|
||||
# Copy sysmonitor files
|
||||
sudo LANG=C cp $IMAGE_CONFIGS/sysmonitor/sysmonitor.py $FILESYSTEM_ROOT/usr/local/bin/sysmonitor.py
|
||||
sudo LANG=C cp $IMAGE_CONFIGS/sysmonitor/sysmonitor.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
|
||||
echo "sysmonitor.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable sysmonitor.service
|
||||
|
||||
|
||||
# Copy watchdog-control files
|
||||
sudo LANG=C cp $IMAGE_CONFIGS/watchdog-control/watchdog-control.sh $FILESYSTEM_ROOT/usr/local/bin/watchdog-control.sh
|
||||
|
691
files/image_config/sysmonitor/sysmonitor.py
Executable file
691
files/image_config/sysmonitor/sysmonitor.py
Executable file
@ -0,0 +1,691 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
from datetime import datetime
|
||||
import time
|
||||
import datetime
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import syslog
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
from swsssdk import ConfigDBConnector
|
||||
from swsssdk import SonicV2Connector
|
||||
import socket
|
||||
import json
|
||||
import fcntl
|
||||
import stat
|
||||
|
||||
# Identifier passed to syslog.openlog() for every record this daemon emits.
SYSLOG_IDENTIFIER = "system#monitor"

# STATE_DB table subscribed to for application readiness changes.
STATE_FEATURE_TABLE_NAME = "FEATURE"
# Timeout handed to swsscommon.DBConnector when subscribing to STATE_DB.
REDIS_TIMEOUT_MS = 0

# Path of the lock file used to build the module-wide FileLock.
SYSREADY_LOCKFILE = "/var/run/sysready.lock"

# Placeholder; rebound to a Logger instance once the class is defined.
logger = None

# Last published aggregate states ("UP" / "DOWN").
SYSTEM_STATE = "DOWN"
SYSTEM_CORESRV_STATE = "DOWN"
SYSTEM_ALLSRV_STATE = "DOWN"

# Aggregate status strings reported by status_core() / status_all().
coresrvs_status = "DOWN"
allsrvs_status = "DOWN"

# Units currently considered down (core subset and complete set).
core_dnsrvs_name_list = []
dnsrvs_name_list = []

# Per-service status records keyed by service name (unit name without suffix).
coresrvs_dict = {}
allsrvs_dict = {}

# Services that are accepted as OK even when their unit is inactive.
spl_srv_list = ['database-chassis', 'gbsyncd']

# Units that make up the "core" readiness check.
core_srv_list = [
    'swss.service',
    'bgp.service',
    'teamd.service',
    'pmon.service',
    'syncd.service',
    'database.service',
    'mgmt-framework.service',
]
|
||||
|
||||
class FileLock:
    """Advisory exclusive lock backed by ``fcntl.flock`` on a lock file.

    The lock file is opened once in ``__init__`` and the same descriptor is
    reused by every ``lock()`` / ``unlock()`` call.  The object can also be
    used as a context manager so the lock is always released::

        with file_lock:
            ...critical section...

    NOTE(review): flock() locks belong to the open file description, so
    callers that share this one descriptor (e.g. several threads in the same
    process) do not block each other; only holders of a separately opened
    descriptor do.  Confirm this is the intended scope of the lock.
    """

    def __init__(self, lock_file):
        """Open (creating or truncating) *lock_file* for later locking."""
        self.f = open(lock_file, 'w')

    def lock(self):
        """Acquire the exclusive lock, blocking until it is available."""
        fcntl.flock(self.f, fcntl.LOCK_EX)

    def unlock(self):
        """Release the lock held on the lock file."""
        fcntl.flock(self.f, fcntl.LOCK_UN)

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if not self.f.closed:
            self.f.close()

    def __enter__(self):
        self.lock()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.unlock()
        # Never swallow an exception raised inside the ``with`` body.
        return False
|
||||
|
||||
sysready_lock = FileLock(SYSREADY_LOCKFILE)
|
||||
|
||||
|
||||
class Logger(object):
    """Small syslog front end exposing one method per syslog priority.

    Every ``log_*`` method sends *msg* to syslog at the matching priority and,
    when ``also_print_to_console`` is true, additionally prints it to stdout.
    """

    def __init__(self, syslog_identifier):
        syslog.openlog(ident=syslog_identifier, logoption=syslog.LOG_NDELAY, facility=syslog.LOG_DAEMON)

    def _emit(self, priority, msg, also_print_to_console):
        # Shared implementation behind all public log_* methods.
        syslog.syslog(priority, msg)
        if also_print_to_console:
            print(msg)

    def log_emerg(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_EMERG, msg, also_print_to_console)

    def log_crit(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_CRIT, msg, also_print_to_console)

    def log_alert(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_ALERT, msg, also_print_to_console)

    def log_error(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_ERR, msg, also_print_to_console)

    def log_warning(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_WARNING, msg, also_print_to_console)

    def log_notice(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_NOTICE, msg, also_print_to_console)

    def log_info(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_INFO, msg, also_print_to_console)

    def log_debug(self, msg, also_print_to_console=False):
        self._emit(syslog.LOG_DEBUG, msg, also_print_to_console)
|
||||
|
||||
#Initialise the syslog infrastructure
|
||||
logger = Logger(SYSLOG_IDENTIFIER)
|
||||
|
||||
class Dict2Obj(object):
    """Expose the keys of a dict as attributes of an object.

    Nested dicts become nested ``Dict2Obj`` instances.  List and tuple values
    are stored as lists in which every dict element is wrapped the same way;
    all other values are stored unchanged.

    d: the source dict
    """

    def __init__(self, d):
        def wrap(item):
            # Only dicts are converted; anything else passes through as-is.
            if isinstance(item, dict):
                return Dict2Obj(item)
            return item

        for key, value in list(d.items()):
            if isinstance(value, (list, tuple)):
                converted = [wrap(element) for element in value]
            else:
                converted = wrap(value)
            setattr(self, key, converted)
|
||||
|
||||
|
||||
def print_console_message(message):
    """Write *message*, prefixed with the current timestamp, to /dev/console."""
    stamp = datetime.datetime.now().strftime("%b %d %H:%M:%S.%f")
    line = "\n{} {} \n ".format(stamp, message)
    with open('/dev/console', 'w') as console:
        console.write(line)
|
||||
|
||||
def post_system_status_core(state, st_db):
    """Store the core readiness *state* under SYSTEM_READY|SYSTEM_STATE_CORE.

    Does nothing when *st_db* is falsy (no database connection available).
    """
    if not st_db:
        return
    st_db.set(st_db.STATE_DB, "SYSTEM_READY|SYSTEM_STATE_CORE", "status", state)
|
||||
|
||||
def post_system_status_all(state, st_db):
    """Store the all-services readiness *state* under SYSTEM_READY|SYSTEM_STATE_ALL.

    Does nothing when *st_db* is falsy (no database connection available).
    """
    if not st_db:
        return
    st_db.set(st_db.STATE_DB, "SYSTEM_READY|SYSTEM_STATE_ALL", "status", state)
|
||||
|
||||
def run_systemctl_show(service):
    """Run ``systemctl show`` for *service* and return its properties.

    The output is parsed as ``KEY=VALUE`` lines (split on the first ``=``);
    lines without ``=`` are ignored.  The result is a Dict2Obj whose
    attributes are the reported property names.

    Raises subprocess.CalledProcessError when systemctl exits non-zero.
    """
    command = ["systemctl", "show", service, "--property=Id,LoadState,UnitFileState,Type,ActiveState,SubState,Result"]
    output = subprocess.check_output(command, universal_newlines=True)
    properties = dict(
        line.split("=", 1) for line in output.split('\n') if "=" in line
    )
    return Dict2Obj(properties)
|
||||
|
||||
def get_all_service_list(config_db):
    """Return the sorted list of unit names that the readiness check covers.

    The list is built from the entries found in the multi-user and sonic
    target ``.wants`` directories, plus ``<feature>.service`` for every
    CONFIG_DB FEATURE entry whose state is neither "disabled" nor
    "always_disabled".  A fixed set of units (ZTP among them) is then
    excluded.
    """
    wants_dirs = ("/etc/systemd/system/multi-user.target.wants", "/etc/systemd/system/sonic.target.wants")
    excluded_units = ('aaastatsd.service', 'aaastatsd.timer', 'rasdaemon.service', 'ztp.service', 'sonic.target', 'sonic-delayed.target')

    # Units wanted by the systemd targets.
    units = []
    for wants_dir in wants_dirs:
        if os.path.exists(wants_dir):
            units.extend(os.listdir(wants_dir))

    # Enabled docker services from the CONFIG_DB feature table.
    for feature, attrs in config_db.get_table("FEATURE").items():
        if attrs["state"] in ("disabled", "always_disabled"):
            continue
        unit = feature + ".service"
        if unit not in units:
            units.append(unit)

    # Drop one occurrence of each excluded unit, if present.
    for unit in excluded_units:
        try:
            units.remove(unit)
        except ValueError:
            pass

    units.sort()
    return units
|
||||
|
||||
|
||||
def get_app_ready_status(service, ap_db, st_db, config_db):
    """Return ``(state, fail_reason)`` describing the app readiness of *service*.

    The CONFIG_DB FEATURE table (then HOST_FEATURE) is consulted for the
    service's ``check_up_status`` flag.  Only when that flag is the string
    "true" is the ``up_status`` field of ``FEATURE|<service>`` in STATE_DB
    read; in every other case the service is reported as "Up".

    Returns ("Up", "") or ("Down", <fail_reason>), where the fail reason is
    taken from STATE_DB and defaults to "NA" when it is not set.
    """
    feature_table = config_db.get_table('FEATURE')
    host_feature_table = config_db.get_table('HOST_FEATURE')

    # FEATURE takes precedence over HOST_FEATURE when both know the service.
    if service in feature_table:
        entry = feature_table[service]
    elif service in host_feature_table:
        entry = host_feature_table[service]
    else:
        return "Up", ""

    # Flag is false or missing: no application-level check requested.
    if entry.get('check_up_status') != "true":
        return "Up", ""

    state_key = 'FEATURE|{}'.format(service)
    if st_db.get(st_db.STATE_DB, state_key, 'up_status') == "true":
        return "Up", ""

    fail_reason = st_db.get(st_db.STATE_DB, state_key, 'fail_reason')
    if fail_reason is None:
        fail_reason = "NA"
    return "Down", fail_reason
|
||||
|
||||
|
||||
def get_unit_status(event, ap_db, st_db, config_db):
    """Evaluate one systemd unit and record its status.

    event:     full unit name, e.g. "swss.service"
    ap_db:     APPL_DB connector (forwarded to get_app_ready_status)
    st_db:     STATE_DB connector (forwarded to get_app_ready_status)
    config_db: CONFIG_DB connector (forwarded to get_app_ready_status)

    The unit is queried through run_systemctl_show().  A loaded unit whose
    unit-file state is enabled, enabled-runtime or static is "OK" when it is
    active/exited, active/running with its application reported "Up", or
    inactive while being a oneshot unit or listed in spl_srv_list.

    The per-service record (service_status, service_up_status, fail_reason)
    is stored in allsrvs_dict, and also in coresrvs_dict for core units.

    Returns "OK" or "NOTOK".  Any failure while querying or evaluating the
    unit is logged and reported as "NOTOK" rather than as an implicit None,
    so callers never mistake a failed check for a healthy unit.
    """
    unit_status = "NOTOK"
    fail_reason = "Unknown"
    service_status = "Not OK"
    service_up_status = "Not OK"
    # Strip only the trailing unit-type suffix.  This cannot raise, so the
    # name is always available to the error handler below, and unit names
    # containing additional dots are handled as well.
    service_name = event.rpartition('.')[0] or event

    try:
        sysctl_show = run_systemctl_show(event)
        if sysctl_show.LoadState == "loaded":
            status = sysctl_show.UnitFileState
            fail_reason = sysctl_show.Result
            active_state = sysctl_show.ActiveState
            sub_state = sysctl_show.SubState
            srv_type = sysctl_show.Type

            #Raise syslog for service state change
            logger.log_info("{} service state changed to [{}/{}]".format(event, active_state, sub_state))

            if status in ("enabled", "enabled-runtime", "static"):
                if fail_reason == "success":
                    fail_reason = "-"

                if active_state == "active" and sub_state == "exited":
                    service_status = "OK"
                    service_up_status = "OK"
                    unit_status = "OK"
                elif active_state == "active" and sub_state == "running":
                    service_status = "OK"
                    init_state, app_fail_reason = get_app_ready_status(service_name, ap_db, st_db, config_db)
                    if init_state == "Up":
                        service_up_status = "OK"
                        unit_status = "OK"
                    else:
                        fail_reason = app_fail_reason
                        unit_status = "NOTOK"
                        # The application is still starting; not a failure.
                        if fail_reason == "docker start":
                            service_up_status = "Starting"
                            fail_reason = "-"
                elif active_state == "activating":
                    service_status = "Starting"
                    service_up_status = "Starting"
                elif active_state == "deactivating":
                    service_status = "Stopping"
                    service_up_status = "Stopping"
                elif active_state == "inactive":
                    if srv_type == "oneshot" or service_name in spl_srv_list:
                        service_status = "OK"
                        service_up_status = "OK"
                        unit_status = "OK"
                    else:
                        unit_status = "NOTOK"
                        if fail_reason == "-":
                            fail_reason = "Inactive"
                else:
                    unit_status = "NOTOK"

        record = {"service_status": service_status, "service_up_status": service_up_status, "fail_reason": fail_reason}
        if event in core_srv_list:
            # Separate copy so the two dictionaries never share one record.
            coresrvs_dict[service_name] = dict(record)
        allsrvs_dict[service_name] = record

    except Exception as e:
        logger.log_error("Get unit status {}-{}".format(service_name, str(e)))
        unit_status = "NOTOK"

    return unit_status
|
||||
|
||||
|
||||
def get_all_system_status(ap_db, st_db, config_db):
    """Check every monitored unit and return the overall readiness.

    Each unit from get_all_service_list() is evaluated with
    get_unit_status(); units reported "NOTOK" are appended to
    dnsrvs_name_list.  The module-level allsrvs_status is updated.

    Returns a ``(state, message)`` tuple where state is "UP" or "DOWN".
    """
    global dnsrvs_name_list
    global allsrvs_status

    all_ok = True
    #logger.log_info("scan_srv_list:[{}]".format(scan_srv_list))
    for unit in get_all_service_list(config_db):
        if get_unit_status(unit, ap_db, st_db, config_db) == "NOTOK":
            all_ok = False
            dnsrvs_name_list.append(unit)

    if not all_ok:
        allsrvs_status = "DOWN"
        return ("DOWN", "System is not ready - one or more services are not up")

    allsrvs_status = "UP"
    return ("UP", "System is ready with all the services")
|
||||
|
||||
|
||||
def get_core_system_status(ap_db, st_db,config_db):
    """Check the core units and return the core readiness.

    Each unit in core_srv_list is evaluated with get_unit_status(); units
    reported "NOTOK" are appended to core_dnsrvs_name_list.  The module-level
    coresrvs_status is updated.

    Returns a ``(state, message)`` tuple where state is "UP" or "DOWN".
    """
    global core_dnsrvs_name_list
    global coresrvs_status

    core_ok = True
    for unit in core_srv_list:
        if get_unit_status(unit, ap_db, st_db, config_db) == "NOTOK":
            core_ok = False
            core_dnsrvs_name_list.append(unit)

    if not core_ok:
        coresrvs_status = "DOWN"
        return ("DOWN", "System is not ready - core services are not ok")

    coresrvs_status = "UP"
    return ("UP", "System is ready")
|
||||
|
||||
|
||||
#Checks current system status
|
||||
def check_system_status(event, st_db, ap_db, config_db):
    """Evaluate core and overall readiness and publish any state change.

    For each of the two aggregates, a change relative to the last published
    value (SYSTEM_STATE / SYSTEM_ALLSRV_STATE) is logged, written to the
    console and posted to STATE_DB.  *event* is accepted for signature
    symmetry with check_unit_status() and is not used here.
    """
    global SYSTEM_STATE
    global SYSTEM_ALLSRV_STATE

    # Core services first.
    core_state, core_msg = get_core_system_status(ap_db, st_db, config_db)
    if core_state != SYSTEM_STATE:
        SYSTEM_STATE = core_state
        logger.log_notice(core_msg)
        print_console_message(core_msg)
        post_system_status_core(core_state, st_db)

    # Then the complete service set.
    all_state, all_msg = get_all_system_status(ap_db, st_db, config_db)
    if all_state != SYSTEM_ALLSRV_STATE:
        SYSTEM_ALLSRV_STATE = all_state
        logger.log_info(all_msg)
        print_console_message(all_msg)
        post_system_status_all(all_state, st_db)
|
||||
|
||||
#Checks the unit status and updates the system status
|
||||
def _next_aggregate_state(event, unit_state, current_state, down_list):
    """Update *down_list* for *event* and return the aggregate "UP"/"DOWN".

    A healthy unit is removed from the down list; the aggregate is "UP" only
    once that list is empty.  An unhealthy unit is added to the list and
    forces "DOWN".  A value is returned on every path.
    """
    if unit_state == "OK":
        if current_state == "UP":
            return "UP"
        if event in down_list:
            down_list.remove(event)
        # UP only if this was the last unit that was still down.
        return "DOWN" if down_list else "UP"

    if event not in down_list:
        down_list.append(event)
    return "DOWN"


def check_unit_status(event, st_db, ap_db, config_db):
    """Re-evaluate readiness after a state change of the unit *event*.

    event:     full unit name, e.g. "bgp.service"
    st_db:     STATE_DB connector
    ap_db:     APPL_DB connector
    config_db: CONFIG_DB connector

    If the unit is a core service, the core aggregate is recomputed; if it is
    part of the current full service list, the all-services aggregate is
    recomputed.  A change of either aggregate is logged, written to the
    console and posted to STATE_DB.  A unit that is no longer in the full
    service list is dropped from allsrvs_dict and dnsrvs_name_list.
    """
    global SYSTEM_STATE
    global SYSTEM_ALLSRV_STATE
    global coresrvs_status
    global allsrvs_status

    #check for core status
    if event in core_srv_list:
        ustate = get_unit_status(event, ap_db, st_db, config_db)
        cstate = _next_aggregate_state(event, ustate, SYSTEM_STATE, core_dnsrvs_name_list)

        if cstate == "DOWN":
            msg = "System is not ready - core services are not ok"
            coresrvs_status = "DOWN"
        else:
            msg = "System is ready with core services"
            coresrvs_status = "UP"

        if SYSTEM_STATE != cstate:
            SYSTEM_STATE = cstate
            logger.log_notice(msg)
            print_console_message(msg)
            post_system_status_core(cstate, st_db)

    #check for all status
    full_srv_list = get_all_service_list(config_db)
    if event in full_srv_list:
        ustate = get_unit_status(event, ap_db, st_db, config_db)
        astate = _next_aggregate_state(event, ustate, SYSTEM_ALLSRV_STATE, dnsrvs_name_list)

        if astate == "DOWN":
            msg = "System is not ready - one or more services are not ok"
            allsrvs_status = "DOWN"
        else:
            msg = "System is ready with all the services"
            allsrvs_status = "UP"

        if SYSTEM_ALLSRV_STATE != astate:
            SYSTEM_ALLSRV_STATE = astate
            logger.log_info(msg)
            print_console_message(msg)
            post_system_status_all(astate, st_db)
    else:
        # The unit is no longer monitored: forget any record kept for it.
        # rpartition() copes with unit names containing more than one dot.
        srv_name = event.rpartition('.')[0] or event
        allsrvs_dict.pop(srv_name, None)

        #also remove from dnsrvslist
        if event in dnsrvs_name_list:
            dnsrvs_name_list.remove(event)
|
||||
|
||||
|
||||
|
||||
|
||||
##############################################################
|
||||
# Listen for STATEDB state event #
|
||||
##############################################################
|
||||
|
||||
def subscribe_statedb(queue):
    """Forward STATE_DB FEATURE table changes to *queue*, forever.

    For every key popped from the subscribed table, ``<key>.service`` is put
    on *queue*.  Any exception is logged; after a two second pause the
    connection and subscription are set up again.
    """
    from swsscommon import swsscommon

    select_timeout_ms = 1000 * 2

    while True:
        try:
            logger.log_info( "Listening for StateDB event, Pid:{}".format(os.getpid()))

            state_db = swsscommon.DBConnector("STATE_DB", REDIS_TIMEOUT_MS, True)
            selector = swsscommon.Select()
            feature_table = swsscommon.SubscriberStateTable(state_db, STATE_FEATURE_TABLE_NAME)
            selector.addSelectable(feature_table)

            while True:
                result, _selected = selector.select(select_timeout_ms)
                if result != swsscommon.Select.OBJECT:
                    # Timeout or error: nothing to pop, wait again.
                    continue
                key, _op, _field_values = feature_table.pop()
                queue.put(key + ".service")
        except Exception as e:
            logger.log_error( str(e))

        time.sleep(2)
|
||||
|
||||
|
||||
def subscribe_statedb_event_thread(queue):
    """Keep a subscribe_statedb() child process running.

    The child is started and joined; whenever it ends (or starting it
    fails, which is logged) a new one is launched after a one second pause.
    """
    while True:
        try:
            listener = mp.Process(target=subscribe_statedb, args=(queue,))
            listener.start()
            # Blocks for as long as the child process is alive.
            listener.join()
        except Exception as e:
            logger.log_error( str(e))

        time.sleep(1)
|
||||
|
||||
##############################################################
|
||||
# Listening for System service event #
|
||||
##############################################################
|
||||
|
||||
# Event queue shared with the listeners; assigned in system_service().
QUEUE = None


def OnJobRemoved(id, job, unit, result):
    """Handle a 'JobRemoved' signal: queue *unit* when the job result is "done".

    Jobs that finished with any other result are ignored.
    """
    #logger.log_debug('{}: Job Removed: {}, {}, {} '.format( id, job, unit, result))
    if result != "done":
        return
    QUEUE.put(unit)
|
||||
|
||||
|
||||
#Sub process for listening the systemd event on dbus
|
||||
def subscribe_service_event(queue):
    """Listen on the system D-Bus for systemd 'JobRemoved' signals.

    Subscribes to the systemd1 manager and dispatches every 'JobRemoved'
    signal to OnJobRemoved(), then runs the GLib main loop; this call does
    not return while the loop is running.

    queue: accepted for the process target signature; OnJobRemoved() puts
           events on the module-level QUEUE.
    """
    import dbus
    # GLib.MainLoop replaces the deprecated GObject.MainLoop alias.
    from gi.repository import GLib
    from dbus.mainloop.glib import DBusGMainLoop

    #logger.log_info( "Listening for systemd service event, Pid:{}".format(os.getpid()))
    DBusGMainLoop(set_as_default=True)

    bus = dbus.SystemBus()
    systemd = bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1')
    manager = dbus.Interface(systemd, 'org.freedesktop.systemd1.Manager')

    # Subscribe() asks systemd to emit its signals to this client.
    manager.Subscribe()
    manager.connect_to_signal('JobRemoved', OnJobRemoved)

    loop = GLib.MainLoop()
    loop.run()
|
||||
|
||||
|
||||
#Start the subprocess to listen the systemd service state change event
|
||||
def subscribe_service_event_thread(queue):
    """Run subscribe_service_event() in a child process, restarting it on exit.

    Each attempt starts the child and waits for it to end, then pauses for
    60 seconds.  After eleven attempts the subscription is given up and an
    error is logged.
    """
    max_attempts = 11
    for _attempt in range(max_attempts):
        try:
            listener = mp.Process(target=subscribe_service_event, args=(queue,))
            listener.start()
            # Blocks for as long as the child process is alive.
            listener.join()
        except Exception as e:
            logger.log_error( str(e))

        time.sleep(60)

    logger.log_error("dbus subscription for systemd1 failed multiple times, exiting the subscription")
|
||||
|
||||
|
||||
def status_core(req):
    """Return the core readiness summary for a sysready client request.

    req: decoded request object (not inspected here)

    Returns a dict with
      "status":   human readable core readiness message
      "coresrvs": fixed-width table, one line per core service, preceded by
                  a header line
    """
    row_format = "{:30s} {:20s} {:20s} {:20s}\n"

    sysready_lock.lock()
    try:
        if coresrvs_status == "UP":
            msg = "System is ready with core services"
        else:
            msg = "System is not ready - core services are not ok"

        rows = [row_format.format("Service-Name", "Service-Status", "App-Ready-Status", "Fail-Reason")]
        rows.extend(
            row_format.format(name, info['service_status'], info['service_up_status'], info['fail_reason'])
            for name, info in coresrvs_dict.items()
        )
    finally:
        # Release the lock even if building the table fails.
        sysready_lock.unlock()

    return {"status": msg, "coresrvs": "".join(rows)}
|
||||
|
||||
|
||||
def status_all(req):
    """Return the all-services readiness summary for a sysready client request.

    req: decoded request object (not inspected here)

    Returns a dict with
      "status":      human readable readiness message
      "allsrvs":     fixed-width table, one line per monitored service,
                     preceded by a header line
      "dnsrvs_name": space separated names of the units currently down
    """
    row_format = "{:30s} {:20s} {:20s} {:20s}\n"

    sysready_lock.lock()
    try:
        if allsrvs_status == "UP":
            msg = "System is ready with all the services"
        else:
            msg = "System is not ready - one or more services are not ok"

        rows = [row_format.format("Service-Name", "Service-Status", "App-Ready-Status", "Fail-Reason")]
        rows.extend(
            row_format.format(name, info['service_status'], info['service_up_status'], info['fail_reason'])
            for name, info in allsrvs_dict.items()
        )
        dnsrvs_name = " ".join(dnsrvs_name_list)
    finally:
        # Release the lock even if building the table fails.
        sysready_lock.unlock()

    return {"status": msg, "allsrvs": "".join(rows), "dnsrvs_name": dnsrvs_name}
|
||||
|
||||
|
||||
def sysready_listen():
    """Serve sysready status requests on a UNIX stream socket, forever.

    A request is a JSON object with a "command" member naming one of the
    supported handlers ("status_core" or "status_all").  The reply is the
    handler's result encoded as JSON, or ``{"status": false, "msg": <error>}``
    when the request cannot be served.
    """
    SERVER_ADDRESS = '/var/run/sysready.socket'

    # Only these commands may be invoked by clients.  The socket is writable
    # by every local user, so dispatching on arbitrary module-level names
    # would let any client call any function of this module.
    handlers = {
        "status_core": status_core,
        "status_all": status_all,
    }

    # Remove a stale socket left behind by a previous run.
    if os.path.exists(SERVER_ADDRESS):
        os.remove(SERVER_ADDRESS)

    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    sock.bind(SERVER_ADDRESS)
    os.chmod(SERVER_ADDRESS, stat.S_IRWXU |stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
    sock.listen(1)

    while True:
        connection, client_address = sock.accept()
        try:
            request = connection.recv(10240)
            #logger.log_info("sysready [ REQ ] {}".format(request))
            if not request:
                # Peer closed without sending anything.
                continue

            try:
                req = Dict2Obj(json.loads(request.decode('utf-8')))
                command = getattr(req, "command", None)
                handler = handlers.get(command)
                if handler is None:
                    raise ValueError("unsupported command: {}".format(command))
                payload = json.dumps(handler(req))
            except Exception as e:
                logger.log_error("sysready {}".format(str(e)))
                payload = json.dumps({"status": False, "msg": str(e)})

            #logger.log_info("sysready [ RES ] {}".format(payload))
            connection.sendall(payload.encode('utf-8'))
        except OSError as e:
            # A failing client connection must not stop the listener.
            logger.log_error("sysready {}".format(str(e)))
        finally:
            connection.close()
|
||||
|
||||
#sock.close() #lgtm [py/unreachable-statement]
|
||||
|
||||
|
||||
def db_connect():
    """Connect to STATE_DB, APPL_DB and CONFIG_DB.

    Returns the tuple ``(st_db, ap_db, config_db)``.  If any connection
    attempt raises, the error is logged and the process exits with status 1.
    """
    try:
        state_conn = SonicV2Connector()
        state_conn.connect(state_conn.STATE_DB, True)

        appl_conn = SonicV2Connector()
        appl_conn.connect(appl_conn.APPL_DB, True)

        config_conn = ConfigDBConnector()
        config_conn.connect()
    except Exception as e:
        logger.log_error("Error: Connection to the DB failed {}".format(str(e)))
        sys.exit(1)

    return state_conn, appl_conn, config_conn
|
||||
|
||||
|
||||
def system_service():
    """Run the system monitor: start the listeners and process events forever.

    Creates the shared event queue, connects to the databases and starts
    three threads (systemd job events, the sysready request socket and
    STATE_DB feature events).  After one initial full status evaluation,
    every queued unit name is handed to check_unit_status().
    """
    global QUEUE
    QUEUE = mp.Queue()

    st_db, ap_db, config_db = db_connect()

    thread_service_event = threading.Thread(target=subscribe_service_event_thread, name='service', args=(QUEUE,))
    thread_service_event.start()

    thread_sysready = threading.Thread(target=sysready_listen, name='sysready', args=())
    thread_sysready.start()

    thread_statedb = threading.Thread(target=subscribe_statedb_event_thread, name='statedb', args=(QUEUE,))
    thread_statedb.start()

    #This is run only once when sysmonitor bootsup
    event = 'SERVICE_EVENT'
    sysready_lock.lock()
    try:
        check_system_status(event, st_db, ap_db, config_db)
    finally:
        sysready_lock.unlock()

    # Queue to receive the STATEDB and Systemd state change event
    while True:
        event = QUEUE.get()
        #logger.log_info( "System event [ "+event+" ] is received")
        try:
            sysready_lock.lock()
            try:
                check_unit_status(event, st_db, ap_db, config_db)
            finally:
                # Always release, even when the check raises, so the lock
                # is never left held after a failed event.
                sysready_lock.unlock()
        except Exception as e:
            logger.log_error( str(e))
            time.sleep(2)
|
||||
|
||||
|
||||
#Main method to launch the process in background
|
||||
if __name__ == "__main__":
    # Entry point.  With --daemon the process forks once: the parent exits
    # immediately and the child carries on as the monitor.
    parser = argparse.ArgumentParser()
    parser.add_argument("--daemon", action='store_true', help="Start with daemon mode")
    args = parser.parse_args()

    if args.daemon:
        try:
            pid = os.fork()
        except OSError:
            logger.log_error("Could not create a child process\n")
            # Without a child there is nothing to continue with; previously
            # execution fell through and failed on the unbound 'pid'.
            sys.exit(1)

        #parent
        if pid != 0:
            sys.exit(0)

    system_service()
|
||||
|
17
files/image_config/sysmonitor/sysmonitor.service
Normal file
17
files/image_config/sysmonitor/sysmonitor.service
Normal file
@ -0,0 +1,17 @@
|
||||
[Unit]
|
||||
Description=Watchdog service for system services and app readiness
|
||||
Requires=database.service
|
||||
After=database.service
|
||||
StartLimitIntervalSec=60
|
||||
StartLimitBurst=3
|
||||
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
Restart=always
|
||||
ExecStart=/usr/local/bin/sysmonitor.py --daemon
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -6,6 +6,7 @@ After=rc-local.service database.service
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/local/bin/determine-reboot-cause
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -66,6 +66,37 @@ module sonic-feature{
|
||||
type stypes:feature_state;
|
||||
default "disabled";
|
||||
}
|
||||
|
||||
leaf check_up_status {
|
||||
description "This configuration controls the system ready tool to check
|
||||
the app ready/up status";
|
||||
type boolean;
|
||||
default false;
|
||||
}
|
||||
}
|
||||
}
|
||||
container HOST_FEATURE {
|
||||
|
||||
description "host feature table in config_db.json";
|
||||
|
||||
list HOST_FEATURE_LIST {
|
||||
|
||||
key "name";
|
||||
|
||||
leaf name {
|
||||
description "host feature name in Host Feature table
|
||||
Example - caclmgrd, hostcfgd";
|
||||
type string {
|
||||
length 1..32;
|
||||
}
|
||||
}
|
||||
|
||||
leaf check_up_status {
|
||||
description "This configuration controls the system ready tool to check
|
||||
the app ready/up status";
|
||||
type boolean;
|
||||
default false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user