2018-08-22 15:02:32 -05:00
|
|
|
#!/bin/bash
#
# Start/wait/stop wrapper for the swss service and the services tied to it.
# Usage: <script> {start|wait|stop} [DEV]

# Optional second argument; it is appended to service, container, log-file
# and lock-file names throughout this script.
DEV=$2

# Service handled by this script.
SERVICE="swss"
# Whitespace-separated list of peer services started/stopped with SERVICE.
PEER="syncd"
# File that debug() appends its timestamped messages to.
DEBUGLOG="/tmp/swss-syncd-debug$DEV.log"
# File locked by lock_service_state_change().
LOCKFILE="/tmp/swss-syncd-lock$DEV"
# Prefix that, followed by DEV, forms the network namespace name.
NAMESPACE_PREFIX="asic"
# Directory holding the optional "<service>_dependent" list files.
ETC_SONIC_PATH="/etc/sonic/"

# Sourced for is_chassis_supervisor and check_asic_status, which are used
# below but not defined in this file.
. /usr/local/bin/asic_status.sh
|
|
|
|
|
2018-09-24 18:35:01 -05:00
|
|
|
# Send a message to the system logger and append it, prefixed with the
# current date, to ${DEBUGLOG}.
# Arguments: $1 - message text
function debug()
{
    # "--" ends option parsing so a message that begins with "-" is logged
    # verbatim; quoting keeps the message a single argument.
    /usr/bin/logger -- "$1"
    /bin/echo "$(date)" "- $1" >> "${DEBUGLOG}"
}
|
|
|
|
|
2021-10-26 21:01:30 -05:00
|
|
|
# Extend the dependent-service lists with entries supplied by other packages.
# Globals: ETC_SONIC_PATH, SERVICE (read);
#          DEPENDENT, MULTI_INST_DEPENDENT (appended to)
# The contents of ${ETC_SONIC_PATH}/${SERVICE}_dependent and
# ${ETC_SONIC_PATH}/${SERVICE}_multi_inst_dependent, when those files exist,
# are appended to the respective list after a single space.
function read_dependent_services()
{
    local dep_file="${ETC_SONIC_PATH}/${SERVICE}_dependent"
    local multi_dep_file="${ETC_SONIC_PATH}/${SERVICE}_multi_inst_dependent"

    # Update dependent list based on other packages requirements
    if [[ -f "${dep_file}" ]]; then
        DEPENDENT="${DEPENDENT} $(< "${dep_file}")"
    fi

    if [[ -f "${multi_dep_file}" ]]; then
        MULTI_INST_DEPENDENT="${MULTI_INST_DEPENDENT} $(< "${multi_dep_file}")"
    fi
}
|
|
|
|
|
2018-09-24 18:35:01 -05:00
|
|
|
# Take an exclusive flock on ${LOCKFILE}.
# Globals: LOCKFILE, SERVICE, DEV (read); LOCKFD (written)
# Opens ${LOCKFILE} for writing on a shell-allocated descriptor stored in
# LOCKFD, locks it, and installs an EXIT trap that releases the lock.
function lock_service_state_change()
{
    debug "Locking ${LOCKFILE} from ${SERVICE}$DEV service"

    # Quoted target: an unquoted path with whitespace is an "ambiguous redirect".
    exec {LOCKFD}>"${LOCKFILE}"
    /usr/bin/flock -x "${LOCKFD}"
    # Double quotes on purpose: the descriptor number is expanded now, so the
    # trap still refers to this descriptor at exit.
    trap "/usr/bin/flock -u ${LOCKFD}" EXIT

    debug "Locked ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
}
|
|
|
|
|
|
|
|
# Release the flock held on the descriptor stored in LOCKFD.
# Globals: LOCKFILE, LOCKFD, SERVICE, DEV (read)
function unlock_service_state_change()
{
    debug "Unlocking ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
    /usr/bin/flock -u "${LOCKFD}"
}
|
|
|
|
|
|
|
|
# Set WARM_BOOT to "true" when STATE_DB reports warm restart enabled for
# either the whole system or ${SERVICE}; otherwise set it to "false".
# Globals: SONIC_DB_CLI, SERVICE (read);
#          SYSTEM_WARM_START, SERVICE_WARM_START, WARM_BOOT (written)
function check_warm_boot()
{
    # $SONIC_DB_CLI is deliberately unquoted: it may hold a command followed
    # by options and must be split into words.
    SYSTEM_WARM_START=$($SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|system" enable)
    SERVICE_WARM_START=$($SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|${SERVICE}" enable)

    if [[ "$SYSTEM_WARM_START" == "true" || "$SERVICE_WARM_START" == "true" ]]; then
        WARM_BOOT="true"
    else
        WARM_BOOT="false"
    fi
}
|
|
|
|
|
2020-08-10 16:48:30 -05:00
|
|
|
# Set FAST_BOOT to "true" when STATE_DB reports fast restart enabled for the
# system; otherwise set it to "false".
# Globals: SYSTEM_FAST_REBOOT, FAST_BOOT (written)
function check_fast_boot()
{
    # This calls sonic-db-cli directly instead of going through $SONIC_DB_CLI.
    # NOTE(review): presumably the flag is kept in the host database rather
    # than a per-namespace one - confirm before changing.
    SYSTEM_FAST_REBOOT=$(sonic-db-cli STATE_DB hget "FAST_RESTART_ENABLE_TABLE|system" enable)

    if [[ "${SYSTEM_FAST_REBOOT}" == "true" ]]; then
        FAST_BOOT="true"
    else
        FAST_BOOT="false"
    fi
}
|
|
|
|
|
2018-10-02 08:08:26 -05:00
|
|
|
# Downgrade a warm boot to a cold one when the orchagent restore count is
# missing from STATE_DB.
# Globals: SONIC_DB_CLI (read); RESTORE_COUNT (written);
#          WARM_BOOT (read, reset to "false" when the count is empty)
function validate_restore_count()
{
    if [[ "$WARM_BOOT" == "true" ]]; then
        RESTORE_COUNT=$($SONIC_DB_CLI STATE_DB hget "WARM_RESTART_TABLE|orchagent" restore_count)
        # We have to make sure db data has not been flushed.
        if [[ -z "$RESTORE_COUNT" ]]; then
            WARM_BOOT="false"
        fi
    fi
}
|
|
|
|
|
|
|
|
# Block until the database answers PING with PONG and CONFIG_DB reports
# CONFIG_DB_INITIALIZED equal to 1, polling once per second with no timeout.
# Globals: SONIC_DB_CLI (read)
function wait_for_database_service()
{
    # Wait for redis server start before database clean
    until $SONIC_DB_CLI PING | grep -q PONG; do
        sleep 1
    done

    # Wait for configDB initialization
    until [[ "$($SONIC_DB_CLI CONFIG_DB GET "CONFIG_DB_INITIALIZED")" -eq 1 ]]; do
        sleep 1
    done
}
|
2018-08-25 03:39:09 -05:00
|
|
|
|
2018-11-03 14:32:46 -05:00
|
|
|
# This function cleans up the tables with specific prefixes from the database
# $1 the database to clean, passed through to $SONIC_DB_CLI as its first
#    argument
# $2 the string of a list of table prefixes, written as a comma-separated
#    list of quoted Lua strings; it is pasted verbatim into a Lua table
#    constructor, so it must come from trusted code only
function clean_up_tables()
{
    # For every pattern, list the matching keys and delete them one by one.
    $SONIC_DB_CLI "$1" EVAL "
    local tables = {$2}
    for i = 1, #tables do
        local matches = redis.call('KEYS', tables[i])
        for j,name in ipairs(matches) do
            redis.call('DEL', name)
        end
    end" 0
}
|
|
|
|
|
2023-09-01 18:20:31 -05:00
|
|
|
# This function cleans up the chassis db table entries created ONLY by this asic
# This is used to do the clean up operation when the line card / asic reboots
# When the asic/lc is RE-booting, the chassis db server is supposed to be running
# in the supervisor. So the clean up is done only when the chassis db is connectable.
# Otherwise no need to do the clean up since both the supervisor and line card may be
# rebooting (the whole chassis scenario)
# The clean up operation is required to delete only those entries created by
# the asic that is rebooted. Entries from the following tables are deleted in the order
# given below
#   (1) SYSTEM_NEIGH
#   (2) SYSTEM_INTERFACE
#   (3) SYSTEM_LAG_MEMBER_TABLE
#   (4) SYSTEM_LAG_TABLE
#   (5) The corresponding LAG IDs of the entries from SYSTEM_LAG_TABLE
#       SYSTEM_LAG_ID_TABLE and SYSTEM_LAG_ID_SET are adjusted appropriately
# Globals: SONIC_DB_CLI, SERVICE, DEV (read)
function clean_up_chassis_db_tables()
{
    local switch_type lc asic
    local num_neigh num_sys_intf num_lag_mem num_sys_lag

    switch_type=$($SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'switch_type')

    # Run clean up only in swss running for voq switches
    if is_chassis_supervisor || [[ "$switch_type" != 'voq' ]]; then
        return
    fi

    # Nothing to do when the chassis db does not answer PING.
    if ! $SONIC_DB_CLI CHASSIS_APP_DB PING | grep -q True; then
        return
    fi

    lc=$($SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname')
    asic=$($SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name')

    # Both values are needed to build the key patterns below. Without them
    # the scripts cannot select this asic's entries, so skip the clean up.
    if [[ -z "$lc" || -z "$asic" ]]; then
        debug "Chassis db clean up for ${SERVICE}$DEV skipped: hostname or asic_name is not set"
        return
    fi

    # First, delete SYSTEM_NEIGH entries
    num_neigh=$(_chassis_db_delete_entries SYSTEM_NEIGH "$lc" "$asic")
    debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_NEIGH entries deleted: $num_neigh"

    # Wait for some time before deleting system interface so that the system interface's "object in use"
    # is cleared in both orchagent and in syncd. Without this delay, the orchagent clears the refcount
    # but the syncd (meta) still has non-zero refcount. Because of this, orchagent gets "object still in use"
    # error and aborts.
    # This delay is needed only if some system neighbors were deleted.
    # The count is compared numerically; anything that is not a plain
    # non-negative integer (for example error text) never triggers the delay.
    if [[ "$num_neigh" =~ ^[0-9]+$ ]] && (( 10#$num_neigh > 0 )); then
        sleep 30
    fi

    # Next, delete SYSTEM_INTERFACE entries
    num_sys_intf=$(_chassis_db_delete_entries SYSTEM_INTERFACE "$lc" "$asic")
    debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_INTERFACE entries deleted: $num_sys_intf"

    # Next, delete SYSTEM_LAG_MEMBER_TABLE entries
    num_lag_mem=$(_chassis_db_delete_entries SYSTEM_LAG_MEMBER_TABLE "$lc" "$asic")
    debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"

    # Wait for some time before deleting system lag so that all the members of the
    # system lag will be cleared.
    # This delay is needed only if some system lag members were deleted
    if [[ "$num_lag_mem" =~ ^[0-9]+$ ]] && (( 10#$num_lag_mem > 0 )); then
        sleep 15
    fi

    # Finally, delete SYSTEM_LAG_TABLE entries and deallocate LAG IDs.
    # HGET yields false inside the script when the LAG has no ID recorded;
    # passing that to SREM would raise and abort the loop, so it is guarded.
    num_sys_lag=$($SONIC_DB_CLI CHASSIS_APP_DB EVAL "
    local nsl = 0
    local host = string.gsub(ARGV[1], '%-', '%%-')
    local dev = ARGV[2]
    local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
    local keylist = redis.call('KEYS', 'SYSTEM_LAG_TABLE*')
    for j,key in ipairs(keylist) do
        local lagname = string.match(key, ps)
        if lagname ~= nil then
            redis.call('DEL', key)
            local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
            if lagid then
                redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
            end
            redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
            nsl = nsl + 1
        end
    end
    return nsl" 0 "$lc" "$asic")
    debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_TABLE entries deleted: $num_sys_lag"
}

# Delete every CHASSIS_APP_DB key of one table that matches the given host
# and asic, and print whatever the database client reports for the script's
# return value (the number of keys deleted).
# Arguments: $1 - table name prefix, e.g. SYSTEM_NEIGH
#            $2 - host name; "-" is escaped inside the script because it is
#                 a magic character in Lua patterns
#            $3 - asic name
function _chassis_db_delete_entries()
{
    $SONIC_DB_CLI CHASSIS_APP_DB EVAL "
    local count = 0
    local host = string.gsub(ARGV[1], '%-', '%%-')
    local dev = ARGV[2]
    local ps = ARGV[3] .. '*|' .. host .. '|' .. dev
    local keylist = redis.call('KEYS', ARGV[3] .. '*')
    for j,key in ipairs(keylist) do
        if string.match(key, ps) ~= nil then
            redis.call('DEL', key)
            count = count + 1
        end
    end
    return count" 0 "$2" "$3" "$1"
}
|
|
|
|
|
2019-08-08 17:45:17 -05:00
|
|
|
# Start the peer and dependent services unless a warm boot is in progress.
# Globals: PEER, DEPENDENT, MULTI_INST_DEPENDENT, DEV (read);
#          WARM_BOOT (refreshed through check_warm_boot)
# Peers and multi-instance dependents are started as "<name>@<DEV>" when DEV
# is non-empty and as "<name>" otherwise; DEPENDENT entries are used as given.
start_peer_and_dependent_services() {
    local unit

    check_warm_boot

    if [[ "$WARM_BOOT" != "true" ]]; then
        # The list variables are deliberately unquoted: they hold
        # whitespace-separated service names.
        for unit in ${PEER}; do
            /bin/systemctl start "${unit}${DEV:+@${DEV}}"
        done
        for unit in ${DEPENDENT}; do
            /bin/systemctl start "${unit}"
        done
        for unit in ${MULTI_INST_DEPENDENT}; do
            /bin/systemctl start "${unit}${DEV:+@${DEV}}"
        done
    fi
}
|
|
|
|
|
|
|
|
# Stop the dependent and peer services, in the reverse of the order used
# when starting them, unless a warm or fast boot is in progress.
# Globals: WARM_BOOT, FAST_BOOT, PEER, DEPENDENT, MULTI_INST_DEPENDENT,
#          DEV (read)
# Peers and multi-instance dependents are stopped as "<name>@<DEV>" when DEV
# is non-empty and as "<name>" otherwise; DEPENDENT entries are used as given.
stop_peer_and_dependent_services() {
    local unit

    # if warm/fast start enabled or peer lock exists, don't stop peer service docker
    if [[ "$WARM_BOOT" != "true" && "$FAST_BOOT" != "true" ]]; then
        # The list variables are deliberately unquoted: they hold
        # whitespace-separated service names.
        for unit in ${MULTI_INST_DEPENDENT}; do
            /bin/systemctl stop "${unit}${DEV:+@${DEV}}"
        done
        for unit in ${DEPENDENT}; do
            /bin/systemctl stop "${unit}"
        done
        for unit in ${PEER}; do
            /bin/systemctl stop "${unit}${DEV:+@${DEV}}"
        done
    fi
}
|
|
|
|
|
2018-09-24 18:35:01 -05:00
|
|
|
# "start" action: under the service-state lock, wait for the database, flush
# stale state unless this is a warm boot, then start the service container
# unless running on a chassis supervisor.
# Globals: SERVICE, DEV, SONIC_DB_CLI (read);
#          WARM_BOOT (set through check_warm_boot / validate_restore_count)
start() {
    local db

    debug "Starting ${SERVICE}$DEV service..."

    lock_service_state_change

    wait_for_database_service
    check_warm_boot
    validate_restore_count

    debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."

    # Don't flush DB during warm boot
    if [[ "$WARM_BOOT" != "true" ]]; then
        # NOTE(review): the message mentions CONFIG, but CONFIG_DB is not
        # among the databases flushed below.
        debug "Flushing APP, ASIC, COUNTER, CONFIG, and partial STATE databases ..."
        for db in APPL_DB ASIC_DB COUNTERS_DB FLEX_COUNTER_DB GB_ASIC_DB GB_COUNTERS_DB RESTAPI_DB; do
            $SONIC_DB_CLI "$db" FLUSHDB
        done
        clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VNET_ROUTE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*', 'SYSTEM_NEIGH_TABLE*'"
        $SONIC_DB_CLI APPL_STATE_DB FLUSHDB
        clean_up_chassis_db_tables
        rm -rf /tmp/cache
    fi

    # On supervisor card, skip starting asic related services here. In wait(),
    # wait until the asic is detected by pmon and published via database.
    if ! is_chassis_supervisor; then
        # start service docker
        # $DEV is deliberately unquoted so that an empty DEV passes no
        # argument at all.
        /usr/bin/${SERVICE}.sh start $DEV
        debug "Started ${SERVICE}$DEV service..."
    fi

    # Unlock has to happen before reaching out to peer service
    unlock_service_state_change
}
|
|
|
|
|
2019-03-08 12:59:41 -06:00
|
|
|
# "wait" action: start the peer and dependent services, give their
# containers up to 60 one-second polls to report running, then hand over to
# docker-wait-any.
# Globals: SERVICE, DEV, PEER, MULTI_INST_DEPENDENT (read);
#          ASIC_STATUS (written on a chassis supervisor)
# Returns: the exit status of /usr/bin/docker-wait-any.
wait() {
    local attempt name running
    local all_peers_running all_deps_running
    local -a wait_targets=()

    # On supervisor card, wait for asic to be online before starting the docker.
    if is_chassis_supervisor; then
        check_asic_status
        ASIC_STATUS=$?

        # start service docker
        if [[ $ASIC_STATUS == 0 ]]; then
            # $DEV is deliberately unquoted so that an empty DEV passes no
            # argument at all.
            /usr/bin/${SERVICE}.sh start $DEV
            debug "Started ${SERVICE}$DEV service..."
        fi
    fi

    start_peer_and_dependent_services

    # Allow some time for peer container to start
    # NOTE: This assumes Docker containers share the same names as their
    # corresponding services
    for attempt in {1..60}; do
        all_peers_running=true
        for name in ${PEER}; do
            running=$(docker inspect -f '{{.State.Running}}' "${name}${DEV}")
            if [[ "$running" != "true" ]]; then
                all_peers_running=false
                break
            fi
        done

        all_deps_running=true
        for name in ${MULTI_INST_DEPENDENT}; do
            running=$(docker inspect -f '{{.State.Running}}' "${name}${DEV}")
            if [[ "$running" != "true" ]]; then
                all_deps_running=false
                break
            fi
        done

        if [[ "$all_peers_running" == "true" && "$all_deps_running" == "true" ]]; then
            break
        fi
        sleep 1
    done

    # Container names passed after -d: every peer, then every multi-instance
    # dependent, each with DEV appended.
    # NOTE: This assumes Docker containers share the same names as their
    # corresponding services
    for name in ${PEER} ${MULTI_INST_DEPENDENT}; do
        wait_targets+=("${name}${DEV}")
    done

    /usr/bin/docker-wait-any -s "${SERVICE}${DEV}" -d "${wait_targets[@]}"
}
|
|
|
|
|
|
|
|
# "stop" action: under the service-state lock, stop the service (or, during
# a warm/fast boot, only kill its container), clear the fast-restart flag
# when this is not a fast boot, then stop the peer and dependent services.
# Globals: SERVICE, DEV, LOCKFILE (read);
#          WARM_BOOT, FAST_BOOT (set through check_warm_boot/check_fast_boot)
stop() {
    debug "Stopping ${SERVICE}$DEV service..."

    [[ -f "${LOCKFILE}" ]] || /usr/bin/touch "${LOCKFILE}"

    lock_service_state_change
    check_warm_boot
    debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
    check_fast_boot
    debug "Fast boot flag: ${SERVICE}$DEV ${FAST_BOOT}."

    # For WARM/FAST boot do not perform service stop
    if [[ "$WARM_BOOT" != "true" && "$FAST_BOOT" != "true" ]]; then
        # $DEV is deliberately unquoted so that an empty DEV passes no
        # argument at all.
        /usr/bin/${SERVICE}.sh stop $DEV
        debug "Stopped ${SERVICE}$DEV service..."
    else
        # The container is addressed as <service><DEV>, the naming used for
        # containers elsewhere in this file; with an empty DEV this is "swss".
        debug "Killing Docker ${SERVICE}${DEV}..."
        /usr/bin/docker kill "${SERVICE}${DEV}" &> /dev/null || debug "Docker ${SERVICE}${DEV} is not running ($?) ..."
    fi

    # Flush FAST_REBOOT table when swss needs to stop. The only
    # time when this would take effect is when fast-reboot
    # encountered error, e.g. syncd crashed. And swss needs to
    # be restarted.
    if [[ "$FAST_BOOT" != "true" ]]; then
        debug "Clearing FAST_RESTART_ENABLE_TABLE flag..."
        sonic-db-cli STATE_DB hset "FAST_RESTART_ENABLE_TABLE|system" "enable" "false"
    fi

    # Unlock has to happen before reaching out to peer service
    unlock_service_state_change

    stop_peer_and_dependent_services
}
|
|
|
|
|
2022-04-20 06:02:49 -05:00
|
|
|
# Add "gbsyncd" to PEER when a gearbox_config.json file exists for this
# platform / hwsku / DEV.
# Globals: PLATFORM, HWSKU, DEV (read); GEARBOX_CONFIG (written);
#          PEER (appended to)
function check_peer_gbsyncd()
{
    GEARBOX_CONFIG=/usr/share/sonic/device/$PLATFORM/$HWSKU/$DEV/gearbox_config.json

    if [[ -f "$GEARBOX_CONFIG" ]]; then
        PEER="$PEER gbsyncd"
    fi
}
|
|
|
|
|
2022-09-08 10:45:06 -05:00
|
|
|
# Add the macsec service to DEPENDENT when CONFIG_DB reports the macsec
# feature state as "enabled".
# Globals: SONIC_DB_CLI, DEV (read); MACSEC_STATE (written);
#          DEPENDENT (appended to)
# The entry is "macsec@<DEV>" when DEV is non-empty and "macsec" otherwise.
function check_macsec()
{
    MACSEC_STATE=$($SONIC_DB_CLI CONFIG_DB hget 'FEATURE|macsec' state)

    if [[ "${MACSEC_STATE}" == 'enabled' ]]; then
        DEPENDENT="${DEPENDENT} macsec${DEV:+@${DEV}}"
    fi
}
|
|
|
|
|
2023-08-11 10:39:22 -05:00
|
|
|
# Add the bgp service to DEPENDENT unless running on a chassis supervisor.
# Globals: DEV (read); DEPENDENT (appended to)
# The entry is "bgp@<DEV>" when DEV is non-empty and "bgp" otherwise.
function check_add_bgp_dependency()
{
    if ! is_chassis_supervisor; then
        DEPENDENT="${DEPENDENT} bgp${DEV:+@${DEV}}"
    fi
}
|
2022-11-29 15:47:37 -06:00
|
|
|
# Report whether a port_config.ini or hwsku.json file exists for this
# platform / hwsku / DEV.
# Globals: PLATFORM, HWSKU, DEV (read); PORT_CONFIG_INI, HWSKU_JSON (written)
# Returns: 0 when at least one of the two files exists, 1 otherwise.
function check_ports_present()
{
    PORT_CONFIG_INI=/usr/share/sonic/device/$PLATFORM/$HWSKU/$DEV/port_config.ini
    HWSKU_JSON=/usr/share/sonic/device/$PLATFORM/$HWSKU/$DEV/hwsku.json

    if [[ -f "$PORT_CONFIG_INI" || -f "$HWSKU_JSON" ]]; then
        return 0
    fi
    return 1
}
|
|
|
|
|
|
|
|
# DEPENDENT initially contains namespace independent services
# namespace specific services are added later in this script.
DEPENDENT="radv"
MULTI_INST_DEPENDENT=""

# Select the database client command line: with DEV set, every query is
# directed at the "<NAMESPACE_PREFIX><DEV>" namespace.
if [[ -n "$DEV" ]]; then
    NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace
    SONIC_DB_CLI="sonic-db-cli -n $NET_NS"
else
    NET_NS=""
    SONIC_DB_CLI="sonic-db-cli"
fi

# $SONIC_DB_CLI is deliberately unquoted: it holds a command plus options.
PLATFORM=$($SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' platform)
HWSKU=$($SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' hwsku)

# Build the peer / dependent service lists for this platform.
check_peer_gbsyncd
check_macsec
check_add_bgp_dependency
check_ports_present
PORTS_PRESENT=$?

# teamd is a multi-instance dependent only when port configuration exists.
if [[ $PORTS_PRESENT == 0 ]]; then
    MULTI_INST_DEPENDENT="teamd"
fi

read_dependent_services

# Dispatch to the function named by the first argument.
case "$1" in
    start|wait|stop)
        $1
        ;;
    *)
        echo "Usage: $0 {start|wait|stop}" >&2
        exit 1
        ;;
esac
|