parent
d4bc0095a5
commit
5f11eb320e
@ -51,27 +51,7 @@
|
|||||||
{%- if feature in ["lldp", "pmon", "radv", "snmp", "telemetry"] %}
|
{%- if feature in ["lldp", "pmon", "radv", "snmp", "telemetry"] %}
|
||||||
"set_owner": "kube", {% else %}
|
"set_owner": "kube", {% else %}
|
||||||
"set_owner": "local", {% endif %} {% endif %}
|
"set_owner": "local", {% endif %} {% endif %}
|
||||||
"high_mem_alert": "disabled",
|
"high_mem_alert": "disabled"
|
||||||
{%- if feature in ["bgp", "swss", "pmon", "nat", "teamd", "dhcp_relay", "sflow", "l2mcd", "udld", "stp", "snmp", "lldp", "radv", "iccpd", "syncd", "vrrp", "mgmt-framework", "tam"] %}
|
|
||||||
"check_up_status" : "false"
|
|
||||||
{%- else %}
|
|
||||||
"check_up_status" : "false"
|
|
||||||
{%- endif %}
|
|
||||||
}{% if not loop.last %},{% endif -%}
|
|
||||||
{% endfor %}
|
|
||||||
},
|
|
||||||
{%- set host_features = [("caclmgrd", false),
|
|
||||||
("cron", false),
|
|
||||||
("docker", false),
|
|
||||||
("hostcfgd", false)] %}
|
|
||||||
"HOST_FEATURE": {
|
|
||||||
{%- for host_feature, check_up_status in host_features %}
|
|
||||||
"{{host_feature}}": {
|
|
||||||
{%- if host_feature in ["caclmgrd", "hostcfgd"] %}
|
|
||||||
"check_up_status" : "false"
|
|
||||||
{%- else %}
|
|
||||||
"check_up_status" : "false"
|
|
||||||
{%- endif %}
|
|
||||||
}{% if not loop.last %},{% endif -%}
|
}{% if not loop.last %},{% endif -%}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
}
|
}
|
||||||
|
@ -346,14 +346,6 @@ echo "ntp.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
|||||||
sudo LANG=C cp $IMAGE_CONFIGS/warmboot-finalizer/finalize-warmboot.sh $FILESYSTEM_ROOT/usr/local/bin/finalize-warmboot.sh
|
sudo LANG=C cp $IMAGE_CONFIGS/warmboot-finalizer/finalize-warmboot.sh $FILESYSTEM_ROOT/usr/local/bin/finalize-warmboot.sh
|
||||||
sudo LANG=C cp $IMAGE_CONFIGS/warmboot-finalizer/warmboot-finalizer.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
|
sudo LANG=C cp $IMAGE_CONFIGS/warmboot-finalizer/warmboot-finalizer.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
|
||||||
echo "warmboot-finalizer.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
echo "warmboot-finalizer.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
||||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable warmboot-finalizer.service
|
|
||||||
|
|
||||||
# Copy sysmonitor files
|
|
||||||
sudo LANG=C cp $IMAGE_CONFIGS/sysmonitor/sysmonitor.py $FILESYSTEM_ROOT/usr/local/bin/sysmonitor.py
|
|
||||||
sudo LANG=C cp $IMAGE_CONFIGS/sysmonitor/sysmonitor.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
|
|
||||||
echo "sysmonitor.service" | sudo tee -a $GENERATED_SERVICE_FILE
|
|
||||||
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable sysmonitor.service
|
|
||||||
|
|
||||||
|
|
||||||
# Copy watchdog-control files
|
# Copy watchdog-control files
|
||||||
sudo LANG=C cp $IMAGE_CONFIGS/watchdog-control/watchdog-control.sh $FILESYSTEM_ROOT/usr/local/bin/watchdog-control.sh
|
sudo LANG=C cp $IMAGE_CONFIGS/watchdog-control/watchdog-control.sh $FILESYSTEM_ROOT/usr/local/bin/watchdog-control.sh
|
||||||
|
@ -1,691 +0,0 @@
|
|||||||
#!/usr/bin/python3
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
import time
|
|
||||||
import datetime
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import threading
|
|
||||||
import syslog
|
|
||||||
import argparse
|
|
||||||
import multiprocessing as mp
|
|
||||||
from swsssdk import ConfigDBConnector
|
|
||||||
from swsssdk import SonicV2Connector
|
|
||||||
import socket
|
|
||||||
import json
|
|
||||||
import fcntl
|
|
||||||
import stat
|
|
||||||
|
|
||||||
SYSLOG_IDENTIFIER="system#monitor"
|
|
||||||
STATE_FEATURE_TABLE_NAME = "FEATURE"
|
|
||||||
REDIS_TIMEOUT_MS = 0
|
|
||||||
SYSTEM_STATE="DOWN"
|
|
||||||
logger = None
|
|
||||||
SYSTEM_CORESRV_STATE="DOWN"
|
|
||||||
SYSTEM_ALLSRV_STATE="DOWN"
|
|
||||||
SYSREADY_LOCKFILE="/var/run/sysready.lock"
|
|
||||||
core_dnsrvs_name_list=[]
|
|
||||||
dnsrvs_name_list=[]
|
|
||||||
allsrvs_dict={}
|
|
||||||
coresrvs_dict={}
|
|
||||||
allsrvs_status="DOWN"
|
|
||||||
coresrvs_status="DOWN"
|
|
||||||
spl_srv_list= ['database-chassis', 'gbsyncd']
|
|
||||||
core_srv_list = [
|
|
||||||
'swss.service',
|
|
||||||
'bgp.service',
|
|
||||||
'teamd.service',
|
|
||||||
'pmon.service',
|
|
||||||
'syncd.service',
|
|
||||||
'database.service',
|
|
||||||
'mgmt-framework.service',
|
|
||||||
]
|
|
||||||
|
|
||||||
class FileLock:
|
|
||||||
def __init__(self, lock_file):
|
|
||||||
self.f = open(lock_file, 'w')
|
|
||||||
|
|
||||||
def lock(self):
|
|
||||||
fcntl.flock(self.f, fcntl.LOCK_EX)
|
|
||||||
|
|
||||||
def unlock(self):
|
|
||||||
fcntl.flock(self.f, fcntl.LOCK_UN)
|
|
||||||
|
|
||||||
sysready_lock = FileLock(SYSREADY_LOCKFILE)
|
|
||||||
|
|
||||||
|
|
||||||
class Logger(object):
|
|
||||||
def __init__(self, syslog_identifier):
|
|
||||||
syslog.openlog(ident=syslog_identifier, logoption=syslog.LOG_NDELAY, facility=syslog.LOG_DAEMON)
|
|
||||||
|
|
||||||
#def __del__(self):
|
|
||||||
#syslog.closelog()
|
|
||||||
|
|
||||||
def log_emerg(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_EMERG, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
def log_crit(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_CRIT, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
def log_alert(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_ALERT, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
|
|
||||||
def log_error(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_ERR, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
def log_warning(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_WARNING, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
def log_notice(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_NOTICE, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
def log_info(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_INFO, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
def log_debug(self, msg, also_print_to_console=False):
|
|
||||||
syslog.syslog(syslog.LOG_DEBUG, msg)
|
|
||||||
|
|
||||||
if also_print_to_console:
|
|
||||||
print(msg)
|
|
||||||
|
|
||||||
#Initalise the syslog infrastructure
|
|
||||||
logger = Logger(SYSLOG_IDENTIFIER)
|
|
||||||
|
|
||||||
class Dict2Obj(object):
|
|
||||||
"""dict to dict2obj
|
|
||||||
d: data"""
|
|
||||||
|
|
||||||
def __init__(self, d):
|
|
||||||
for a, b in list(d.items()):
|
|
||||||
if isinstance(b, (list, tuple)):
|
|
||||||
setattr(self, a, [Dict2Obj(x) if isinstance(
|
|
||||||
x, dict) else x for x in b])
|
|
||||||
else:
|
|
||||||
setattr(self, a, Dict2Obj(b) if isinstance(b, dict) else b)
|
|
||||||
|
|
||||||
|
|
||||||
def print_console_message(message):
|
|
||||||
with open('/dev/console', 'w') as console:
|
|
||||||
console.write("\n{} {} \n ".format(datetime.datetime.now().strftime("%b %d %H:%M:%S.%f"), message))
|
|
||||||
|
|
||||||
def post_system_status_core(state, st_db):
|
|
||||||
if st_db:
|
|
||||||
st_db.set(st_db.STATE_DB, "SYSTEM_READY|SYSTEM_STATE_CORE", "status", state)
|
|
||||||
|
|
||||||
def post_system_status_all(state, st_db):
|
|
||||||
if st_db:
|
|
||||||
st_db.set(st_db.STATE_DB, "SYSTEM_READY|SYSTEM_STATE_ALL", "status", state)
|
|
||||||
|
|
||||||
def run_systemctl_show(service):
|
|
||||||
a = subprocess.check_output(["systemctl", "show", service, "--property=Id,LoadState,UnitFileState,Type,ActiveState,SubState,Result"], universal_newlines=True).split('\n')
|
|
||||||
json_dict = {}
|
|
||||||
for e in a:
|
|
||||||
kv = e.split("=", 1)
|
|
||||||
if len(kv) == 2:
|
|
||||||
json_dict[kv[0]] = kv[1]
|
|
||||||
result = Dict2Obj(json_dict)
|
|
||||||
return result
|
|
||||||
|
|
||||||
def get_all_service_list(config_db):
|
|
||||||
dir_list=[]
|
|
||||||
|
|
||||||
#add the services from the below targets
|
|
||||||
path= ["/etc/systemd/system/multi-user.target.wants", "/etc/systemd/system/sonic.target.wants"]
|
|
||||||
for p in path:
|
|
||||||
if os.path.exists(p):
|
|
||||||
dir_list+= os.listdir(p)
|
|
||||||
|
|
||||||
#add the enabled docker services from config db feature table
|
|
||||||
feature_table = config_db.get_table("FEATURE")
|
|
||||||
for srv in feature_table.keys():
|
|
||||||
if feature_table[srv]["state"] not in ["disabled", "always_disabled"]:
|
|
||||||
srvext=srv+".service"
|
|
||||||
if srvext not in dir_list:
|
|
||||||
dir_list.append(srvext)
|
|
||||||
|
|
||||||
#Keep ZTP in exclusion list
|
|
||||||
exclude_list= ['aaastatsd.service', 'aaastatsd.timer' , 'rasdaemon.service', 'ztp.service', 'sonic.target', 'sonic-delayed.target']
|
|
||||||
for l in exclude_list:
|
|
||||||
if l in dir_list:
|
|
||||||
dir_list.remove(l)
|
|
||||||
|
|
||||||
#sort it
|
|
||||||
dir_list.sort()
|
|
||||||
|
|
||||||
return dir_list
|
|
||||||
|
|
||||||
|
|
||||||
def get_app_ready_status(service, ap_db, st_db, config_db):
|
|
||||||
#check FEATURE table from config db for the service' check_up_status flag
|
|
||||||
#if marked to true, then read the service up_status field from FEATURE table of state db.
|
|
||||||
#else, just return true (or) Up
|
|
||||||
fail_reason=""
|
|
||||||
configdb_feature_table = config_db.get_table('FEATURE')
|
|
||||||
configdb_host_feature_table = config_db.get_table('HOST_FEATURE')
|
|
||||||
service_name = service
|
|
||||||
|
|
||||||
if service_name not in configdb_feature_table.keys() and service_name not in configdb_host_feature_table.keys():
|
|
||||||
pstate = "Up"
|
|
||||||
else:
|
|
||||||
if service_name in configdb_feature_table.keys():
|
|
||||||
check_app_up_status = configdb_feature_table[service_name].get('check_up_status')
|
|
||||||
elif service_name in configdb_host_feature_table.keys():
|
|
||||||
check_app_up_status = configdb_host_feature_table[service_name].get('check_up_status')
|
|
||||||
|
|
||||||
if check_app_up_status == "true":
|
|
||||||
up_status_flag = st_db.get(st_db.STATE_DB, 'FEATURE|{}'.format(service_name), 'up_status')
|
|
||||||
if up_status_flag == "true":
|
|
||||||
pstate = "Up"
|
|
||||||
else:
|
|
||||||
fail_reason = st_db.get(st_db.STATE_DB, 'FEATURE|{}'.format(service_name), 'fail_reason')
|
|
||||||
if fail_reason is None:
|
|
||||||
fail_reason = "NA"
|
|
||||||
pstate = "Down"
|
|
||||||
else:
|
|
||||||
#Either check_up_status marked false or entry does not exist
|
|
||||||
pstate = "Up"
|
|
||||||
|
|
||||||
return pstate,fail_reason
|
|
||||||
|
|
||||||
|
|
||||||
def get_unit_status(event, ap_db, st_db, config_db):
|
|
||||||
""" Get a unit status"""
|
|
||||||
global coresrvs_dict
|
|
||||||
global core_srv_list
|
|
||||||
global allsrvs_dict
|
|
||||||
global spl_srv_list
|
|
||||||
unit_status = "NOTOK"
|
|
||||||
fail_reason="Unknown"
|
|
||||||
try:
|
|
||||||
service_status = "Not OK"
|
|
||||||
service_up_status = "Not OK"
|
|
||||||
service_name,last_name = event.split('.')
|
|
||||||
sysctl_show = run_systemctl_show(event)
|
|
||||||
load_state = sysctl_show.LoadState
|
|
||||||
if load_state == "loaded":
|
|
||||||
status = sysctl_show.UnitFileState
|
|
||||||
fail_reason = sysctl_show.Result
|
|
||||||
active_state = sysctl_show.ActiveState
|
|
||||||
sub_state = sysctl_show.SubState
|
|
||||||
srv_type = sysctl_show.Type
|
|
||||||
|
|
||||||
#Raise syslog for service state change
|
|
||||||
logger.log_info("{} service state changed to [{}/{}]".format(event, active_state, sub_state))
|
|
||||||
|
|
||||||
if status == "enabled" or status == "enabled-runtime" or status == "static":
|
|
||||||
if fail_reason == "success":
|
|
||||||
fail_reason = "-"
|
|
||||||
if (active_state == "active" and sub_state == "exited"):
|
|
||||||
service_status = "OK"
|
|
||||||
service_up_status = "OK"
|
|
||||||
unit_status = "OK"
|
|
||||||
elif active_state == "active" and sub_state == "running":
|
|
||||||
service_status = "OK"
|
|
||||||
init_state,app_fail_reason = get_app_ready_status(service_name, ap_db, st_db, config_db)
|
|
||||||
if init_state == "Up":
|
|
||||||
service_up_status = "OK"
|
|
||||||
unit_status = "OK"
|
|
||||||
else:
|
|
||||||
fail_reason = app_fail_reason
|
|
||||||
unit_status = "NOTOK"
|
|
||||||
if fail_reason == "docker start":
|
|
||||||
service_up_status = "Starting"
|
|
||||||
fail_reason = "-"
|
|
||||||
elif active_state == "activating":
|
|
||||||
service_status = "Starting"
|
|
||||||
service_up_status = "Starting"
|
|
||||||
elif active_state == "deactivating":
|
|
||||||
service_status = "Stopping"
|
|
||||||
service_up_status = "Stopping"
|
|
||||||
elif active_state == "inactive":
|
|
||||||
if srv_type == "oneshot" or service_name in spl_srv_list:
|
|
||||||
service_status = "OK"
|
|
||||||
service_up_status = "OK"
|
|
||||||
unit_status = "OK"
|
|
||||||
else:
|
|
||||||
unit_status = "NOTOK"
|
|
||||||
if fail_reason == "-":
|
|
||||||
fail_reason = "Inactive"
|
|
||||||
else:
|
|
||||||
unit_status = "NOTOK"
|
|
||||||
|
|
||||||
if event in core_srv_list:
|
|
||||||
coresrvs_dict[service_name] = {"service_status":service_status, "service_up_status":service_up_status, "fail_reason":fail_reason}
|
|
||||||
|
|
||||||
allsrvs_dict[service_name] = {"service_status":service_status, "service_up_status":service_up_status, "fail_reason":fail_reason}
|
|
||||||
|
|
||||||
return unit_status
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error("Get unit status {}-{}".format(service_name, str(e)))
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_system_status(ap_db, st_db, config_db):
|
|
||||||
""" Shows the system ready status"""
|
|
||||||
global dnsrvs_name_list
|
|
||||||
global allsrvs_status
|
|
||||||
scan_srv_list=[]
|
|
||||||
overall_ok_flag = 1
|
|
||||||
|
|
||||||
scan_srv_list=get_all_service_list(config_db)
|
|
||||||
#logger.log_info("scan_srv_list:[{}]".format(scan_srv_list))
|
|
||||||
|
|
||||||
for service in scan_srv_list:
|
|
||||||
ustate = get_unit_status(service,ap_db,st_db,config_db)
|
|
||||||
if ustate == "NOTOK":
|
|
||||||
overall_ok_flag &= 0
|
|
||||||
dnsrvs_name_list.append(service)
|
|
||||||
|
|
||||||
if overall_ok_flag == 1:
|
|
||||||
allsrvs_status = "UP"
|
|
||||||
return ("UP", "System is ready with all the services")
|
|
||||||
else:
|
|
||||||
allsrvs_status = "DOWN"
|
|
||||||
return ("DOWN", "System is not ready - one or more services are not up")
|
|
||||||
|
|
||||||
|
|
||||||
def get_core_system_status(ap_db, st_db,config_db):
|
|
||||||
""" Shows the core system ready status"""
|
|
||||||
global core_srv_list
|
|
||||||
global core_dnsrvs_name_list
|
|
||||||
global coresrvs_status
|
|
||||||
core_ok_flag = 1
|
|
||||||
|
|
||||||
for service in core_srv_list:
|
|
||||||
ustate = get_unit_status(service,ap_db,st_db,config_db)
|
|
||||||
if ustate == "NOTOK":
|
|
||||||
core_ok_flag &= 0
|
|
||||||
core_dnsrvs_name_list.append(service)
|
|
||||||
|
|
||||||
if core_ok_flag == 1:
|
|
||||||
coresrvs_status = "UP"
|
|
||||||
return ("UP", "System is ready")
|
|
||||||
else:
|
|
||||||
coresrvs_status = "DOWN"
|
|
||||||
return ("DOWN", "System is not ready - core services are not ok")
|
|
||||||
|
|
||||||
|
|
||||||
#Checks current system status
|
|
||||||
def check_system_status(event, st_db, ap_db, config_db):
|
|
||||||
global SYSTEM_STATE
|
|
||||||
(cstate, msg) = get_core_system_status(ap_db, st_db,config_db)
|
|
||||||
if SYSTEM_STATE != cstate:
|
|
||||||
SYSTEM_STATE=cstate
|
|
||||||
logger.log_notice(msg)
|
|
||||||
print_console_message(msg)
|
|
||||||
post_system_status_core(cstate, st_db)
|
|
||||||
|
|
||||||
|
|
||||||
global SYSTEM_ALLSRV_STATE
|
|
||||||
(astate, msg) = get_all_system_status(ap_db, st_db, config_db)
|
|
||||||
if SYSTEM_ALLSRV_STATE != astate:
|
|
||||||
SYSTEM_ALLSRV_STATE=astate
|
|
||||||
logger.log_info(msg)
|
|
||||||
print_console_message(msg)
|
|
||||||
post_system_status_all(astate, st_db)
|
|
||||||
|
|
||||||
#Checks the unit status and updates the system status
|
|
||||||
def check_unit_status(event, st_db, ap_db, config_db):
|
|
||||||
global SYSTEM_STATE
|
|
||||||
global SYSTEM_ALLSRV_STATE
|
|
||||||
global core_dnsrvs_name_list
|
|
||||||
global dnsrvs_name_list
|
|
||||||
global core_srv_list
|
|
||||||
global coresrvs_status
|
|
||||||
global allsrvs_status
|
|
||||||
global allsrvs_dict
|
|
||||||
|
|
||||||
#astate="DOWN"
|
|
||||||
#cstate="DOWN"
|
|
||||||
#msg=""
|
|
||||||
|
|
||||||
#check for core status
|
|
||||||
if event in core_srv_list:
|
|
||||||
ustate = get_unit_status(event,ap_db,st_db,config_db)
|
|
||||||
if ustate == "OK" and SYSTEM_STATE == "UP":
|
|
||||||
cstate = "UP"
|
|
||||||
elif ustate == "OK" and SYSTEM_STATE == "DOWN":
|
|
||||||
if event in core_dnsrvs_name_list:
|
|
||||||
core_dnsrvs_name_list.remove(event)
|
|
||||||
#need to check if need to set cstate to UP if this was the only down service before, which became UP now.
|
|
||||||
if len(core_dnsrvs_name_list) == 0:
|
|
||||||
cstate = "UP"
|
|
||||||
else:
|
|
||||||
cstate = "DOWN"
|
|
||||||
else:
|
|
||||||
if event not in core_dnsrvs_name_list:
|
|
||||||
core_dnsrvs_name_list.append(event)
|
|
||||||
cstate = "DOWN"
|
|
||||||
|
|
||||||
if cstate == "DOWN":
|
|
||||||
msg = "System is not ready - core services are not ok"
|
|
||||||
coresrvs_status = "DOWN"
|
|
||||||
else:
|
|
||||||
msg = "System is ready with core services"
|
|
||||||
coresrvs_status = "UP"
|
|
||||||
|
|
||||||
#logger.log_info("core - event:{} ustate:{} cstate:{} dnsrv:{}".format(event,ustate,cstate,core_dnsrvs_name_list))
|
|
||||||
|
|
||||||
if SYSTEM_STATE != cstate:
|
|
||||||
SYSTEM_STATE=cstate
|
|
||||||
logger.log_notice(msg)
|
|
||||||
print_console_message(msg)
|
|
||||||
post_system_status_core(cstate, st_db)
|
|
||||||
|
|
||||||
#check for all status
|
|
||||||
full_srv_list=get_all_service_list(config_db)
|
|
||||||
#logger.log_info("full srv list:{}".format(full_srv_list))
|
|
||||||
if event in full_srv_list:
|
|
||||||
ustate = get_unit_status(event,ap_db,st_db,config_db)
|
|
||||||
if ustate == "OK" and SYSTEM_ALLSRV_STATE == "UP":
|
|
||||||
astate = "UP"
|
|
||||||
elif ustate == "OK" and SYSTEM_ALLSRV_STATE == "DOWN":
|
|
||||||
if event in dnsrvs_name_list:
|
|
||||||
dnsrvs_name_list.remove(event)
|
|
||||||
#need to check if need to set cstate to UP if this was the only down service before, which became UP now.
|
|
||||||
if len(dnsrvs_name_list) == 0:
|
|
||||||
astate = "UP"
|
|
||||||
else:
|
|
||||||
astate = "DOWN"
|
|
||||||
else:
|
|
||||||
if event not in dnsrvs_name_list:
|
|
||||||
dnsrvs_name_list.append(event)
|
|
||||||
astate = "DOWN"
|
|
||||||
|
|
||||||
if astate == "DOWN":
|
|
||||||
msg = "System is not ready - one or more services are not ok"
|
|
||||||
allsrvs_status = "DOWN"
|
|
||||||
else:
|
|
||||||
msg = "System is ready with all the services"
|
|
||||||
allsrvs_status = "UP"
|
|
||||||
|
|
||||||
#logger.log_info("all - event:{} ustate:{} astate:{} dnsrvs:{}".format(event,ustate,astate,dnsrvs_name_list))
|
|
||||||
|
|
||||||
if SYSTEM_ALLSRV_STATE != astate:
|
|
||||||
SYSTEM_ALLSRV_STATE=astate
|
|
||||||
logger.log_info(msg)
|
|
||||||
print_console_message(msg)
|
|
||||||
post_system_status_all(astate, st_db)
|
|
||||||
|
|
||||||
else:
|
|
||||||
#if received event is not in current full service list but exists in global dictionary & list, then it should be removed from dictionary & list
|
|
||||||
srv_name,last_name = event.split('.')
|
|
||||||
if allsrvs_dict.__contains__(srv_name):
|
|
||||||
allsrvs_dict.pop(srv_name)
|
|
||||||
|
|
||||||
#also remove from dnsrvslist
|
|
||||||
if event in dnsrvs_name_list:
|
|
||||||
dnsrvs_name_list.remove(event)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##############################################################
|
|
||||||
# Listen for STATEDB state event #
|
|
||||||
##############################################################
|
|
||||||
|
|
||||||
def subscribe_statedb(queue):
|
|
||||||
from swsscommon import swsscommon
|
|
||||||
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
logger.log_info( "Listening for StateDB event, Pid:{}".format(os.getpid()))
|
|
||||||
SELECT_TIMEOUT_MS = 1000 * 2
|
|
||||||
|
|
||||||
db = swsscommon.DBConnector("STATE_DB", REDIS_TIMEOUT_MS, True)
|
|
||||||
sel = swsscommon.Select()
|
|
||||||
cst = swsscommon.SubscriberStateTable(db, STATE_FEATURE_TABLE_NAME)
|
|
||||||
sel.addSelectable(cst)
|
|
||||||
|
|
||||||
while True:
|
|
||||||
(state, c) = sel.select(SELECT_TIMEOUT_MS)
|
|
||||||
if state == swsscommon.Select.OBJECT:
|
|
||||||
(key, op, cfvs) = cst.pop()
|
|
||||||
#logger.log_info(key+"featureevent")
|
|
||||||
key_ext = key+".service"
|
|
||||||
queue.put(key_ext)
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error( str(e))
|
|
||||||
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
|
|
||||||
def subscribe_statedb_event_thread(queue):
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
process_statedb_event = mp.Process(target=subscribe_statedb, args=(queue,) )
|
|
||||||
process_statedb_event.start()
|
|
||||||
process_statedb_event.join()
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error( str(e))
|
|
||||||
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
##############################################################
|
|
||||||
# Listening for System service event #
|
|
||||||
##############################################################
|
|
||||||
|
|
||||||
QUEUE=None
|
|
||||||
def OnJobRemoved(id, job, unit, result):
|
|
||||||
|
|
||||||
global QUEUE
|
|
||||||
|
|
||||||
#logger.log_debug('{}: Job Removed: {}, {}, {} '.format( id, job, unit, result))
|
|
||||||
if result == "done":
|
|
||||||
QUEUE.put(unit)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
#Sub process for listening the systemd event on dbus
|
|
||||||
def subscribe_service_event(queue):
|
|
||||||
import dbus
|
|
||||||
from gi.repository import GObject
|
|
||||||
from dbus.mainloop.glib import DBusGMainLoop
|
|
||||||
|
|
||||||
#logger.log_info( "Listening for systemd service event, Pid:{}".format(os.getpid()))
|
|
||||||
DBusGMainLoop(set_as_default=True)
|
|
||||||
|
|
||||||
bus = dbus.SystemBus()
|
|
||||||
systemd = bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1')
|
|
||||||
manager = dbus.Interface(systemd, 'org.freedesktop.systemd1.Manager')
|
|
||||||
|
|
||||||
manager.Subscribe()
|
|
||||||
manager.connect_to_signal('JobRemoved', OnJobRemoved)
|
|
||||||
|
|
||||||
loop = GObject.MainLoop()
|
|
||||||
loop.run()
|
|
||||||
|
|
||||||
|
|
||||||
#Start the subprocess to listen the systemd service state change event
|
|
||||||
def subscribe_service_event_thread(queue):
|
|
||||||
retry_count=0
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
process_service_event = mp.Process(target=subscribe_service_event, args=(queue,) )
|
|
||||||
process_service_event.start()
|
|
||||||
process_service_event.join()
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error( str(e))
|
|
||||||
|
|
||||||
time.sleep(60)
|
|
||||||
retry_count+=1
|
|
||||||
if retry_count > 10:
|
|
||||||
logger.log_error("dbus subscription for systemd1 failed multiple times, exiting the subscription")
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
def status_core(req):
|
|
||||||
"""shows the system status core"""
|
|
||||||
global coresrvs_dict
|
|
||||||
global coresrvs_status
|
|
||||||
coresrvs=""
|
|
||||||
|
|
||||||
sysready_lock.lock()
|
|
||||||
if coresrvs_status == "UP":
|
|
||||||
msg = "System is ready with core services"
|
|
||||||
else:
|
|
||||||
msg = "System is not ready - core services are not ok"
|
|
||||||
|
|
||||||
coresrvs+="{:30s} {:20s} {:20s} {:20s}\n".format("Service-Name","Service-Status","App-Ready-Status", "Fail-Reason")
|
|
||||||
for srv in coresrvs_dict.keys():
|
|
||||||
coresrvs+="{:30s} {:20s} {:20s} {:20s}\n".format(srv, coresrvs_dict[srv]['service_status'],
|
|
||||||
coresrvs_dict[srv]['service_up_status'],
|
|
||||||
coresrvs_dict[srv]['fail_reason'])
|
|
||||||
sysready_lock.unlock()
|
|
||||||
|
|
||||||
return {"status":msg, "coresrvs":coresrvs}
|
|
||||||
|
|
||||||
|
|
||||||
def status_all(req):
|
|
||||||
"""shows the system status all"""
|
|
||||||
global allsrvs_dict
|
|
||||||
global allsrvs_status
|
|
||||||
global dnsrvs_name_list
|
|
||||||
str1=" "
|
|
||||||
allsrvs=""
|
|
||||||
dnsrvs_name=""
|
|
||||||
|
|
||||||
sysready_lock.lock()
|
|
||||||
if allsrvs_status == "UP":
|
|
||||||
msg = "System is ready with all the services"
|
|
||||||
else:
|
|
||||||
msg = "System is not ready - one or more services are not ok"
|
|
||||||
|
|
||||||
allsrvs+="{:30s} {:20s} {:20s} {:20s}\n".format("Service-Name","Service-Status","App-Ready-Status", "Fail-Reason")
|
|
||||||
for srv in allsrvs_dict.keys():
|
|
||||||
allsrvs+="{:30s} {:20s} {:20s} {:20s}\n".format(srv, allsrvs_dict[srv]['service_status'],
|
|
||||||
allsrvs_dict[srv]['service_up_status'],
|
|
||||||
allsrvs_dict[srv]['fail_reason'])
|
|
||||||
|
|
||||||
dnsrvs_name=str1.join(dnsrvs_name_list)
|
|
||||||
sysready_lock.unlock()
|
|
||||||
|
|
||||||
return {"status":msg, "allsrvs":allsrvs, "dnsrvs_name":dnsrvs_name}
|
|
||||||
|
|
||||||
|
|
||||||
def sysready_listen():
|
|
||||||
SERVER_ADDRESS = '/var/run/sysready.socket'
|
|
||||||
|
|
||||||
if os.path.exists(SERVER_ADDRESS):
|
|
||||||
os.remove(SERVER_ADDRESS)
|
|
||||||
|
|
||||||
sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
||||||
sock.bind(SERVER_ADDRESS)
|
|
||||||
os.chmod(SERVER_ADDRESS, stat.S_IRWXU |stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
|
|
||||||
sock.listen(1)
|
|
||||||
fail_res={"status":False, "msg":None}
|
|
||||||
while True:
|
|
||||||
connection, client_address = sock.accept()
|
|
||||||
try:
|
|
||||||
request = connection.recv(10240)
|
|
||||||
#logger.log_info("sysready [ REQ ] {}".format(request))
|
|
||||||
if request is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
req=Dict2Obj(json.loads(request.decode('utf-8')))
|
|
||||||
|
|
||||||
response = globals()[req.command](req)
|
|
||||||
res=json.dumps(response)
|
|
||||||
#logger.log_info("sysready [ RES ] {}".format(res))
|
|
||||||
connection.sendall(res.encode('utf-8'))
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error("sysready {}".format(str(e)))
|
|
||||||
fail_res['msg']=str(e)
|
|
||||||
connection.sendall(json.dumps(fail_res).encode('utf-8'))
|
|
||||||
|
|
||||||
connection.close()
|
|
||||||
|
|
||||||
#sock.close() #lgtm [py/unreachable-statement]
|
|
||||||
|
|
||||||
|
|
||||||
def db_connect():
|
|
||||||
try:
|
|
||||||
st_db = SonicV2Connector()
|
|
||||||
st_db.connect(st_db.STATE_DB,True)
|
|
||||||
ap_db = SonicV2Connector()
|
|
||||||
ap_db.connect(ap_db.APPL_DB,True)
|
|
||||||
config_db = ConfigDBConnector()
|
|
||||||
config_db.connect()
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error("Error: Connection to the DB failed {}".format(str(e)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
return st_db,ap_db,config_db
|
|
||||||
|
|
||||||
|
|
||||||
def system_service():
|
|
||||||
|
|
||||||
global QUEUE
|
|
||||||
QUEUE = mp.Queue()
|
|
||||||
|
|
||||||
st_db,ap_db,config_db = db_connect()
|
|
||||||
|
|
||||||
thread_service_event = threading.Thread(target=subscribe_service_event_thread, name='service', args=(QUEUE,))
|
|
||||||
thread_service_event.start()
|
|
||||||
|
|
||||||
thread_sysready = threading.Thread(target=sysready_listen, name='sysready', args=())
|
|
||||||
thread_sysready.start()
|
|
||||||
|
|
||||||
thread_statedb = threading.Thread(target=subscribe_statedb_event_thread, name='statedb', args=(QUEUE,))
|
|
||||||
thread_statedb.start()
|
|
||||||
|
|
||||||
event = 'SERVICE_EVENT'
|
|
||||||
sysready_lock.lock()
|
|
||||||
#This is run only once when sysmonitor bootsup
|
|
||||||
check_system_status(event, st_db, ap_db, config_db)
|
|
||||||
sysready_lock.unlock()
|
|
||||||
|
|
||||||
# Queue to receive the STATEDB and Systemd state change event
|
|
||||||
while True:
|
|
||||||
event = QUEUE.get()
|
|
||||||
#logger.log_info( "System event [ "+event+" ] is received")
|
|
||||||
try:
|
|
||||||
sysready_lock.lock()
|
|
||||||
check_unit_status(event, st_db, ap_db, config_db)
|
|
||||||
sysready_lock.unlock()
|
|
||||||
except Exception as e:
|
|
||||||
logger.log_error( str(e))
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
|
|
||||||
#Main method to lanch the process in background
|
|
||||||
if __name__ == "__main__":
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("--daemon", action='store_true', help="Start with daemon mode")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if args.daemon:
|
|
||||||
try:
|
|
||||||
pid = os.fork()
|
|
||||||
except OSError:
|
|
||||||
logger.log_error("Could not create a child process\n")
|
|
||||||
#parent
|
|
||||||
if pid != 0:
|
|
||||||
exit()
|
|
||||||
|
|
||||||
system_service()
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
|||||||
[Unit]
|
|
||||||
Description=Watchdog service for system services and app readiness
|
|
||||||
Requires=database.service
|
|
||||||
After=database.service
|
|
||||||
StartLimitIntervalSec=60
|
|
||||||
StartLimitBurst=3
|
|
||||||
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=forking
|
|
||||||
Restart=always
|
|
||||||
ExecStart=/usr/local/bin/sysmonitor.py --daemon
|
|
||||||
ExecReload=/bin/kill -HUP $MAINPID
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
|
|
@ -6,7 +6,6 @@ After=rc-local.service database.service
|
|||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
ExecStart=/usr/local/bin/determine-reboot-cause
|
ExecStart=/usr/local/bin/determine-reboot-cause
|
||||||
RemainAfterExit=yes
|
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
@ -66,37 +66,6 @@ module sonic-feature{
|
|||||||
type stypes:feature_state;
|
type stypes:feature_state;
|
||||||
default "disabled";
|
default "disabled";
|
||||||
}
|
}
|
||||||
|
|
||||||
leaf check_up_status {
|
|
||||||
description "This configuration controls the system ready tool to check
|
|
||||||
the app ready/up status";
|
|
||||||
type boolean;
|
|
||||||
default false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
container HOST_FEATURE {
|
|
||||||
|
|
||||||
description "host feature table in config_db.json";
|
|
||||||
|
|
||||||
list HOST_FEATURE_LIST {
|
|
||||||
|
|
||||||
key "name";
|
|
||||||
|
|
||||||
leaf name {
|
|
||||||
description "host feature name in Host Feature table
|
|
||||||
Example - caclmgrd, hostcfgd";
|
|
||||||
type string {
|
|
||||||
length 1..32;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
leaf check_up_status {
|
|
||||||
description "This configuration controls the system ready tool to check
|
|
||||||
the app ready/up status";
|
|
||||||
type boolean;
|
|
||||||
default false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user