[systemd] ASIC status based service bringup on VOQ chassis (#7477)
Changes to allow starting per-ASIC services like swss and syncd only after the platform vendor code has detected the ASIC and published a notification. The systemd service ordering we want is database->database@->pmon->swss@->syncd@->teamd@->lldp@. There is also a requirement that the management, telemetry and snmp dockers can start even if not all ASIC services are up. Why I did it: For VOQ chassis, the fabric cards will have 1-N asics. Also, there could be multiple removable fabric cards. On the supervisor, swss and syncd containers need to be started only if the fabric card is in Online state and the respective asics are detected by the kernel. Using systemd, the dependent services can be in inactive state. How I did it: Introduce a mechanism where every ASIC-dependent service waits for its ASIC's state to be published via PMON to REDIS. Once the subscription notification is received, the service proceeds to create the respective dockers. For fixed platforms, systemd is unchanged, i.e. the service bring-up and docker creation happen in the start()/ExecStartPre routine of the .sh scripts. For VOQ chassis platforms on the supervisor, the service bring-up skips docker creation in the start() routine and does it in the wait()/ExecStart routine of the .sh scripts instead. Management dockers are decoupled from ASIC docker creation.
This commit is contained in:
parent
f9231723f9
commit
3fd6e8d500
@ -16,9 +16,9 @@ StartLimitBurst=3
|
||||
|
||||
[Service]
|
||||
User={{ sonicadmin_user }}
|
||||
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start{% if multi_instance == 'true' %} %i{% endif %}
|
||||
ExecStart=/usr/bin/{{docker_container_name}}.sh wait{% if multi_instance == 'true' %} %i{% endif %}
|
||||
ExecStop=/usr/bin/{{docker_container_name}}.sh stop{% if multi_instance == 'true' %} %i{% endif %}
|
||||
ExecStartPre=/usr/local/bin/{{docker_container_name}}.sh start{% if multi_instance == 'true' %} %i{% endif %}
|
||||
ExecStart=/usr/local/bin/{{docker_container_name}}.sh wait{% if multi_instance == 'true' %} %i{% endif %}
|
||||
ExecStop=/usr/local/bin/{{docker_container_name}}.sh stop{% if multi_instance == 'true' %} %i{% endif %}
|
||||
RestartSec=30
|
||||
|
||||
[Install]
|
||||
|
@ -784,14 +784,17 @@ sudo LANG=C chroot $FILESYSTEM_ROOT fuser -km /sys || true
|
||||
sudo LANG=C chroot $FILESYSTEM_ROOT umount -lf /sys
|
||||
{% endif %}
|
||||
|
||||
# Copy service scripts (swss, syncd, bgp, teamd, radv)
|
||||
# Copy service scripts (swss, syncd, bgp, teamd, lldp, radv)
|
||||
sudo LANG=C cp $SCRIPTS_DIR/swss.sh $FILESYSTEM_ROOT/usr/local/bin/swss.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/syncd.sh $FILESYSTEM_ROOT/usr/local/bin/syncd.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/syncd_common.sh $FILESYSTEM_ROOT/usr/local/bin/syncd_common.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/gbsyncd.sh $FILESYSTEM_ROOT/usr/local/bin/gbsyncd.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/bgp.sh $FILESYSTEM_ROOT/usr/local/bin/bgp.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/teamd.sh $FILESYSTEM_ROOT/usr/local/bin/teamd.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/lldp.sh $FILESYSTEM_ROOT/usr/local/bin/lldp.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/radv.sh $FILESYSTEM_ROOT/usr/local/bin/radv.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/asic_status.sh $FILESYSTEM_ROOT/usr/local/bin/asic_status.sh
|
||||
sudo LANG=C cp $SCRIPTS_DIR/asic_status.py $FILESYSTEM_ROOT/usr/local/bin/asic_status.py
|
||||
|
||||
# Copy sonic-netns-exec script
|
||||
sudo LANG=C cp $SCRIPTS_DIR/sonic-netns-exec $FILESYSTEM_ROOT/usr/bin/sonic-netns-exec
|
||||
|
78
files/scripts/asic_status.py
Executable file
78
files/scripts/asic_status.py
Executable file
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
bootstrap-asic
|
||||
"""
|
||||
try:
|
||||
import re
|
||||
import sys
|
||||
from sonic_py_common import daemon_base
|
||||
from swsscommon import swsscommon
|
||||
from sonic_py_common import multi_asic
|
||||
from sonic_py_common.logger import Logger
|
||||
except ImportError as e:
|
||||
raise ImportError(str(e) + " - required module not found")
|
||||
|
||||
#
|
||||
# Constants ====================================================================
|
||||
#
|
||||
SYSLOG_IDENTIFIER = 'asic_status.py'
|
||||
CHASSIS_ASIC_INFO_TABLE = 'CHASSIS_ASIC_TABLE'
|
||||
SELECT_TIMEOUT_MSECS = 5000
|
||||
|
||||
def main():
    """Block until the requested fabric ASIC is reported online or offline.

    Command-line arguments (exactly two are required):
        sys.argv[1] -- service name; used only in log messages.
        sys.argv[2] -- ASIC id to wait for, compared as a string against the
                       trailing digits of each CHASSIS_ASIC_TABLE key.

    Exit status:
        0 -- a SET notification arrived for the requested ASIC id whose
             'name' field starts with 'FABRIC-CARD'.
        1 -- the platform reports zero ASICs, or a DEL notification arrived
             for the requested ASIC id.

    Raises:
        Exception: if the number of command-line arguments is not two.
    """
    logger = Logger(SYSLOG_IDENTIFIER)
    logger.set_min_log_priority_info()

    if len(sys.argv) != 3:
        raise Exception('Pass service and valid asic-id as arguments')

    service = sys.argv[1]
    args_asic_id = sys.argv[2]

    # Get num asics
    num_asics = multi_asic.get_num_asics()
    if num_asics == 0:
        logger.log_error('Detected no asics on this platform for service {}'.format(service))
        sys.exit(1)

    # Connect to CHASSIS_STATE_DB and subscribe to CHASSIS_ASIC_TABLE notifications
    state_db = daemon_base.db_connect("CHASSIS_STATE_DB")

    sel = swsscommon.Select()
    sst = swsscommon.SubscriberStateTable(state_db, CHASSIS_ASIC_INFO_TABLE)
    sel.addSelectable(sst)

    while True:
        (state, c) = sel.select(SELECT_TIMEOUT_MSECS)
        # Timeouts and any other non-OBJECT result carry no data; keep waiting.
        if state != swsscommon.Select.OBJECT:
            continue

        (asic_key, asic_op, asic_fvp) = sst.pop()

        # The ASIC id is the trailing run of digits in the table key. A key
        # without one cannot be matched against the requested id, so skip it
        # instead of crashing on a None match object.
        asic_id = re.search(r'\d+$', asic_key)
        if asic_id is None:
            logger.log_info('Ignoring key {} without a trailing asic id'.format(asic_key))
            continue
        global_asic_id = asic_id.group(0)

        if asic_op == 'SET':
            asic_fvs = dict(asic_fvp)
            asic_name = asic_fvs.get('name')
            if asic_name is None:
                logger.log_info('Unable to get asic_name for asic{}'.format(global_asic_id))
                continue

            if not asic_name.startswith('FABRIC-CARD'):
                logger.log_info('Skipping module with asic_name {} for asic{}'.format(asic_name, global_asic_id))
                continue

            if global_asic_id == args_asic_id:
                logger.log_info('Detected asic{} is online'.format(global_asic_id))
                sys.exit(0)
        elif asic_op == 'DEL':
            logger.log_info('Detected asic{} is offline'.format(global_asic_id))
            # Only the removal of the ASIC this service is waiting for is a
            # failure; a different ASIC going away must not abort this wait.
            if global_asic_id == args_asic_id:
                sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
26
files/scripts/asic_status.sh
Executable file
26
files/scripts/asic_status.sh
Executable file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Succeeds (status 0) when /etc/sonic/chassisdb.conf exists as a regular
# file, and fails (status 1) otherwise. Callers use the exit status directly
# in `if` conditions.
is_chassis_supervisor() {
    [ -f /etc/sonic/chassisdb.conf ]
}
|
||||
|
||||
# Wait for the asic identified by $DEV to be reported online.
#
# Reads the globals $SERVICE and $DEV set by the sourcing script and logs
# through that script's debug() function.
# Returns 0 when $DEV is empty or asic_status.py exits with status 0,
# and 1 when asic_status.py exits with any other status.
check_asic_status() {
    # Services that are not started in a namespace have no asic to wait for.
    [[ -n $DEV ]] || return 0

    # asic_status.py runs until it can report on the asic; a zero exit
    # status means the asic was detected.
    if ! /usr/local/bin/asic_status.py $SERVICE $DEV; then
        debug "$SERVICE failed to detect asic $DEV..."
        return 1
    fi

    debug "$SERVICE successfully detected asic $DEV..."
    return 0
}
|
59
files/scripts/lldp.sh
Executable file
59
files/scripts/lldp.sh
Executable file
@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
|
||||
. /usr/local/bin/asic_status.sh
|
||||
|
||||
# Log a message to syslog and append it, prefixed with the current date,
# to the file named by $DEBUGLOG.
#   $1 - the message text
function debug()
{
    # Quote the message so it reaches logger as one argument: unquoted it
    # would be word-split and glob-expanded, and "--" keeps a message that
    # starts with "-" from being parsed as an option.
    /usr/bin/logger -- "$1"
    # $(date) is left unquoted on purpose so the timestamp keeps the same
    # single-space-separated layout the log file has always had.
    /bin/echo $(date) "- $1" >> "${DEBUGLOG}"
}
|
||||
|
||||
# ExecStartPre entry point: start the service docker, except on a chassis
# supervisor card, where that is left to wait().
start() {
    debug "Starting ${SERVICE}$DEV service..."

    # On a supervisor card the asic may not be detected yet; wait() starts
    # the docker once pmon has published the asic state via the database.
    if is_chassis_supervisor; then
        return 0
    fi

    # Not a supervisor: start the service docker right away.
    /usr/bin/${SERVICE}.sh start $DEV
    debug "Started ${SERVICE}$DEV service..."
}
|
||||
|
||||
# ExecStart entry point: on a chassis supervisor card, first wait for the
# asic and start the service docker; then wait on the docker in all cases.
# Sets the global ASIC_STATUS to the result of check_asic_status.
wait() {
    if is_chassis_supervisor; then
        # The docker was not started in start(); hold here until the asic
        # is reported online.
        check_asic_status
        ASIC_STATUS=$?

        # Start the service docker only when the asic was detected.
        if [ "$ASIC_STATUS" -eq 0 ]; then
            /usr/bin/${SERVICE}.sh start $DEV
            debug "Started ${SERVICE}$DEV service..."
        fi
    fi

    /usr/bin/${SERVICE}.sh wait $DEV
}
|
||||
|
||||
# ExecStop entry point: stop the service docker, logging before and after.
stop() {
    debug "Stopping ${SERVICE}$DEV service..."

    # Hand off to the docker control script installed under /usr/bin.
    /usr/bin/${SERVICE}.sh stop ${DEV}

    debug "Stopped ${SERVICE}$DEV service..."
}
|
||||
|
||||
# Script entry point.
#   $1 - action: start, wait or stop
#   $2 - optional instance id; empty when the service is not per-instance
SERVICE="lldp"
DEV=$2
DEBUGLOG="/tmp/${SERVICE}-debug${DEV}.log"

# Run the function matching the requested action; anything else is a usage
# error.
case "$1" in
    start)
        start
        ;;
    wait)
        wait
        ;;
    stop)
        stop
        ;;
    *)
        echo "Usage: $0 {start|wait|stop}"
        exit 1
        ;;
esac
|
@ -12,6 +12,8 @@ if [[ -f /etc/sonic/${SERVICE}_multi_inst_dependent ]]; then
|
||||
MULTI_INST_DEPENDENT="${MULTI_INST_DEPENDENT} cat /etc/sonic/${SERVICE}_multi_inst_dependent"
|
||||
fi
|
||||
|
||||
. /usr/local/bin/asic_status.sh
|
||||
|
||||
function debug()
|
||||
{
|
||||
/usr/bin/logger $1
|
||||
@ -158,15 +160,31 @@ start() {
|
||||
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*'"
|
||||
fi
|
||||
|
||||
# start service docker
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
# On supervisor card, skip starting asic related services here. In wait(),
|
||||
# wait until the asic is detected by pmon and published via database.
|
||||
if ! is_chassis_supervisor; then
|
||||
# start service docker
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
fi
|
||||
|
||||
# Unlock has to happen before reaching out to peer service
|
||||
unlock_service_state_change
|
||||
}
|
||||
|
||||
wait() {
|
||||
# On supervisor card, wait for asic to be online before starting the docker.
|
||||
if is_chassis_supervisor; then
|
||||
check_asic_status
|
||||
ASIC_STATUS=$?
|
||||
|
||||
# start service docker
|
||||
if [[ $ASIC_STATUS == 0 ]]; then
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
fi
|
||||
fi
|
||||
|
||||
start_peer_and_dependent_services
|
||||
|
||||
# Allow some time for peer container to start
|
||||
|
@ -11,6 +11,8 @@
|
||||
# For examples of these, see gbsyncd.sh and syncd.sh.
|
||||
#
|
||||
|
||||
. /usr/local/bin/asic_status.sh
|
||||
|
||||
function debug()
|
||||
{
|
||||
/usr/bin/logger $1
|
||||
@ -104,14 +106,30 @@ start() {
|
||||
|
||||
startplatform
|
||||
|
||||
# start service docker
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE} service..."
|
||||
# On supervisor card, skip starting asic related services here. In wait(),
|
||||
# wait until the asic is detected by pmon and published via database.
|
||||
if ! is_chassis_supervisor; then
|
||||
# start service docker
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
fi
|
||||
|
||||
unlock_service_state_change
|
||||
}
|
||||
|
||||
wait() {
|
||||
# On supervisor card, wait for asic to be online before starting the docker.
|
||||
if is_chassis_supervisor; then
|
||||
check_asic_status
|
||||
ASIC_STATUS=$?
|
||||
|
||||
# start service docker
|
||||
if [[ $ASIC_STATUS == 0 ]]; then
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
fi
|
||||
fi
|
||||
|
||||
waitplatform
|
||||
|
||||
/usr/bin/${SERVICE}.sh wait $DEV
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
. /usr/local/bin/asic_status.sh
|
||||
|
||||
function debug()
|
||||
{
|
||||
/usr/bin/logger $1
|
||||
@ -48,12 +50,28 @@ start() {
|
||||
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
|
||||
debug "Fast boot flag: ${SERVICE}$DEV ${Fast_BOOT}."
|
||||
|
||||
# start service docker
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
# On supervisor card, skip starting asic related services here. In wait(),
|
||||
# wait until the asic is detected by pmon and published via database.
|
||||
if ! is_chassis_supervisor; then
|
||||
# start service docker
|
||||
/usr/bin/${SERVICE}.sh start $DEV
|
||||
debug "Started ${SERVICE}$DEV service..."
|
||||
fi
|
||||
}
|
||||
|
||||
# ExecStart entry point. On a chassis supervisor card the service docker is
# started here, once check_asic_status reports the asic online, rather than
# in start(). The function then waits on the docker in all cases.
# Sets the global ASIC_STATUS to the result of check_asic_status.
wait() {
    if is_chassis_supervisor; then
        check_asic_status
        ASIC_STATUS=$?

        # A zero status means the asic is online, so the docker can start.
        if [ "$ASIC_STATUS" -eq 0 ]; then
            /usr/bin/${SERVICE}.sh start $DEV
            debug "Started ${SERVICE}$DEV service..."
        fi
    fi

    /usr/bin/${SERVICE}.sh wait $DEV
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user