sonic-buildimage/files/scripts/swss.sh

520 lines
16 KiB
Bash
Raw Normal View History

#!/bin/bash
DEV=$2
SERVICE="swss"
PEER="syncd"
DEBUGLOG="/tmp/swss-syncd-debug$DEV.log"
LOCKFILE="/tmp/swss-syncd-lock$DEV"
NAMESPACE_PREFIX="asic"
ETC_SONIC_PATH="/etc/sonic/"
. /usr/local/bin/asic_status.sh
function debug()
{
/usr/bin/logger $1
/bin/echo `date` "- $1" >> ${DEBUGLOG}
}
function read_dependent_services()
{
# Update dependent list based on other packages requirements
if [[ -f ${ETC_SONIC_PATH}/${SERVICE}_dependent ]]; then
DEPENDENT="${DEPENDENT} $(cat ${ETC_SONIC_PATH}/${SERVICE}_dependent)"
fi
if [[ -f ${ETC_SONIC_PATH}/${SERVICE}_multi_inst_dependent ]]; then
MULTI_INST_DEPENDENT="${MULTI_INST_DEPENDENT} $(cat ${ETC_SONIC_PATH}/${SERVICE}_multi_inst_dependent)"
fi
}
function lock_service_state_change()
{
debug "Locking ${LOCKFILE} from ${SERVICE}$DEV service"
exec {LOCKFD}>${LOCKFILE}
/usr/bin/flock -x ${LOCKFD}
trap "/usr/bin/flock -u ${LOCKFD}" EXIT
debug "Locked ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
}
function unlock_service_state_change()
{
debug "Unlocking ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
/usr/bin/flock -u ${LOCKFD}
}
function check_warm_boot()
{
SYSTEM_WARM_START=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|system" enable`
SERVICE_WARM_START=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|${SERVICE}" enable`
if [[ x"$SYSTEM_WARM_START" == x"true" ]] || [[ x"$SERVICE_WARM_START" == x"true" ]]; then
WARM_BOOT="true"
else
WARM_BOOT="false"
fi
}
function check_fast_boot()
{
SYSTEM_FAST_REBOOT=`sonic-db-cli STATE_DB hget "FAST_RESTART_ENABLE_TABLE|system" enable`
if [[ x"${SYSTEM_FAST_REBOOT}" == x"true" ]]; then
FAST_BOOT="true"
else
FAST_BOOT="false"
fi
}
function validate_restore_count()
{
if [[ x"$WARM_BOOT" == x"true" ]]; then
RESTORE_COUNT=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_TABLE|orchagent" restore_count`
# We have to make sure db data has not been flushed.
if [[ -z "$RESTORE_COUNT" ]]; then
WARM_BOOT="false"
fi
fi
}
function wait_for_database_service()
{
# Wait for redis server start before database clean
until [[ $($SONIC_DB_CLI PING | grep -c PONG) -gt 0 ]]; do
sleep 1;
done
# Wait for configDB initialization
Fix CONFIG_DB_INITIALIZED flag check logic and set/reset flag for warmboot (#15685) (#16217) Cherypick of #15685 MSFT ADO: 24274591 Why I did it Two changes: 1 Fix a day1 issue, where check to wait until CONFIG_DB_INITIALIZED is incorrect. There are multiple places where same incorrect logic is used. Current logic (until [[ $($SONIC_DB_CLI CONFIG_DB GET "CONFIG_DB_INITIALIZED") ]];) will always result in pass, irrespective of the result of GET operation. root@str2-7060cx-32s-29:~# sonic-db-cli CONFIG_DB GET "CONFIG_DB_INITIALIZED" 1 root@str2-7060cx-32s-29:~# until [[ $(sonic-db-cli CONFIG_DB GET "CONFIG_DB_INITIALIZED") ]]; do echo "entered here"; done root@str2-7060cx-32s-29:~# root@str2-7060cx-32s-29:~# root@str2-7060cx-32s-29:~# sonic-db-cli CONFIG_DB GET "CONFIG_DB_INITIALIZED" 0 root@str2-7060cx-32s-29:~# until [[ $(sonic-db-cli CONFIG_DB GET "CONFIG_DB_INITIALIZED") ]]; do echo "entered here"; done root@str2-7060cx-32s-29:~# Fix this logic by checking for value of flag to be "1". root@str2-7060cx-32s-29:~# until [[ $(sonic-db-cli CONFIG_DB GET "CONFIG_DB_INITIALIZED") -eq 1 ]]; do echo "entered here"; done entered here entered here entered here This gap in logic was highlighted when another fix was merged: #14933 The issue being fixed here caused warmboot-finalizer to not wait until config-db is initialized. 2 Set and unset CONFIG_DB_INITIALIZED for warm-reboot case Currently, during warm shutdown CONFIG_DB_INITIALIZED's value is stored in redis db backup. This is restored back when the dump is loaded during warm-recovery. So the value of CONFIG_DB_INITIALIZED does not depend on config db's state, however it remain what it was before reboot. Fix this by setting CONFIG_DB_INITIALIZED to 0 as when the DB is loaded, and set it to 1 after db_migrator is done. Work item tracking Microsoft ADO (number only): How I did it How to verify it
2023-08-24 03:58:24 -05:00
until [[ $($SONIC_DB_CLI CONFIG_DB GET "CONFIG_DB_INITIALIZED") -eq 1 ]];
do sleep 1;
done
}
# This function cleans up the tables with specific prefixes from the database
# $1 the index of the database
# $2 the string of a list of table prefixes
function clean_up_tables()
{
$SONIC_DB_CLI $1 EVAL "
local tables = {$2}
for i = 1, table.getn(tables) do
local matches = redis.call('KEYS', tables[i])
for j,name in ipairs(matches) do
redis.call('DEL', name)
end
end" 0
}
# This function cleans up the chassis db table entries created ONLY by this asic
# This is used to do the clean up operation when the line card / asic reboots
# When the asic/lc is RE-booting, the chassis db server is supposed to be running
# in the supervisor. So the clean up is done when only the chassis db connectable.
# Otherwise no need to do the clean up since both the supervisor and line card may be
# rebooting (the whole chassis scenario)
# The clean up operation is required to delete only those entries created by
# the asic that is rebooted. Entries from the following tables are deleted in the order
# given below
# (1) SYSTEM_NEIGH
# (2) SYSTEM_INTERFACE
# (3) SYSTEM_LAG_MEMBER_TABLE
# (4) SYSTEM_LAG_TABLE
# (5) The corresponding LAG IDs of the entries from SYSTEM_LAG_TABLE
# SYSTEM_LAG_ID_TABLE and SYSTEM_LAG_ID_SET are adjusted appropriately
function clean_up_chassis_db_tables()
{
switch_type=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'switch_type'`
# Run clean up only in swss running for voq switches
if is_chassis_supervisor || [[ $switch_type != 'voq' ]]; then
return
fi
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
return
fi
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
# First, delete SYSTEM_NEIGH entries
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
num_neigh=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nn = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev
local keylist = redis.call('KEYS', 'SYSTEM_NEIGH*')
for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then
redis.call('DEL', key)
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
nn = nn + 1
end
end
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
return nn" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_NEIGH entries deleted: $num_neigh"
# Wait for some time before deleting system interface so that the system interface's "object in use"
# is cleared in both orchangent and in syncd. Without this delay, the orchagent clears the refcount
# but the syncd (meta) still has no-zero refcount. Because of this, orchagent gets "object still in use"
# error and aborts.
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
# This delay is needed only if some system neighbors were deleted.
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
if [[ $num_neigh > 0 ]]; then
sleep 30
fi
# Next, delete SYSTEM_INTERFACE entries
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
num_sys_intf=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nsi = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev
local keylist = redis.call('KEYS', 'SYSTEM_INTERFACE*')
for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then
redis.call('DEL', key)
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
nsi = nsi + 1
end
end
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
return nsi" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_INTERFACE entries deleted: $num_sys_intf"
# Next, delete SYSTEM_LAG_MEMBER_TABLE entries
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
num_lag_mem=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nlm = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev
local keylist = redis.call('KEYS', 'SYSTEM_LAG_MEMBER_TABLE*')
for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then
redis.call('DEL', key)
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
nlm = nlm + 1
end
end
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
return nlm" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"
# Wait for some time before deleting system lag so that the all the memebers of the
# system lag will be cleared.
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
# This delay is needed only if some system lag members were deleted
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
if [[ $num_lag_mem > 0 ]]; then
sleep 15
fi
# Finally, delete SYSTEM_LAG_TABLE entries and deallot LAG IDs
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
num_sys_lag=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nsl = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
local keylist = redis.call('KEYS', 'SYSTEM_LAG_TABLE*')
for j,key in ipairs(keylist) do
local lagname = string.match(key, ps)
if lagname ~= nil then
redis.call('DEL', key)
local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
nsl = nsl + 1
end
end
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
return nsl" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_TABLE entries deleted: $num_sys_lag"
}
start_peer_and_dependent_services() {
check_warm_boot
if [[ x"$WARM_BOOT" != x"true" ]]; then
for peer in ${PEER}; do
if [[ ! -z $DEV ]]; then
/bin/systemctl start ${peer}@$DEV
else
/bin/systemctl start ${peer}
fi
done
for dep in ${DEPENDENT}; do
/bin/systemctl start ${dep}
done
for dep in ${MULTI_INST_DEPENDENT}; do
if [[ ! -z $DEV ]]; then
/bin/systemctl start ${dep}@$DEV
else
/bin/systemctl start ${dep}
fi
done
fi
}
stop_peer_and_dependent_services() {
# if warm/fast start enabled or peer lock exists, don't stop peer service docker
if [[ x"$WARM_BOOT" != x"true" ]] && [[ x"$FAST_BOOT" != x"true" ]]; then
for dep in ${MULTI_INST_DEPENDENT}; do
if [[ ! -z $DEV ]]; then
/bin/systemctl stop ${dep}@$DEV
else
/bin/systemctl stop ${dep}
fi
done
for dep in ${DEPENDENT}; do
/bin/systemctl stop ${dep}
done
for peer in ${PEER}; do
if [[ ! -z $DEV ]]; then
/bin/systemctl stop ${peer}@$DEV
else
/bin/systemctl stop ${peer}
fi
done
fi
}
start() {
debug "Starting ${SERVICE}$DEV service..."
lock_service_state_change
wait_for_database_service
check_warm_boot
validate_restore_count
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
# Don't flush DB during warm boot
if [[ x"$WARM_BOOT" != x"true" ]]; then
debug "Flushing APP, ASIC, COUNTER, CONFIG, and partial STATE databases ..."
$SONIC_DB_CLI APPL_DB FLUSHDB
$SONIC_DB_CLI ASIC_DB FLUSHDB
$SONIC_DB_CLI COUNTERS_DB FLUSHDB
$SONIC_DB_CLI FLEX_COUNTER_DB FLUSHDB
$SONIC_DB_CLI GB_ASIC_DB FLUSHDB
$SONIC_DB_CLI GB_COUNTERS_DB FLUSHDB
$SONIC_DB_CLI RESTAPI_DB FLUSHDB
[swss] Chassis db clean up optimization and bug fixes (#16454) (#16644) * [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting clearing the system interface entries, wait for sometime only if some system neighbors were deleted. If there are no system neighbors entries deleted for the asic coming up, no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag memebers were deleted. If there are no system lag members deleted no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic is coming up, when system neigh entries are deleted from chassis ap db (as part of chassis db clean up), there is no orchs/process running to process the delete messages from chassis redis. Because of this, stale system neigh are entries present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss serive came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when sevice comes up. Signed-off-by: vedganes <veda.ganesan@nokia.com> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <veda.ganesan@nokia.com> --------- Signed-off-by: vedganes <veda.ganesan@nokia.com> (cherry picked from commit b13b41fc220be1f670cccc41773049e731d9b970)
2023-09-21 21:58:27 -05:00
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*', 'VNET_ROUTE_TUNNEL_TABLE*', 'SYSTEM_NEIGH_TABLE*'"
$SONIC_DB_CLI APPL_STATE_DB FLUSHDB
clean_up_chassis_db_tables
rm -rf /tmp/cache
fi
# On supervisor card, skip starting asic related services here. In wait(),
# wait until the asic is detected by pmon and published via database.
if ! is_chassis_supervisor; then
# start service docker
/usr/bin/${SERVICE}.sh start $DEV
debug "Started ${SERVICE}$DEV service..."
fi
# Unlock has to happen before reaching out to peer service
unlock_service_state_change
}
wait() {
# On supervisor card, wait for asic to be online before starting the docker.
if is_chassis_supervisor; then
check_asic_status
ASIC_STATUS=$?
# start service docker
if [[ $ASIC_STATUS == 0 ]]; then
/usr/bin/${SERVICE}.sh start $DEV
debug "Started ${SERVICE}$DEV service..."
fi
fi
start_peer_and_dependent_services
# Allow some time for peer container to start
# NOTE: This assumes Docker containers share the same names as their
# corresponding services
for SECS in {1..60}; do
ALL_PEERS_RUNNING=true
for peer in ${PEER}; do
if [[ ! -z $DEV ]]; then
RUNNING=$(docker inspect -f '{{.State.Running}}' ${peer}$DEV)
else
RUNNING=$(docker inspect -f '{{.State.Running}}' ${peer})
fi
if [[ x"$RUNNING" != x"true" ]]; then
ALL_PEERS_RUNNING=false
break
fi
done
[docker-teamd]: Add teamd as a depedent service to swss (#5628) **- Why I did it** On teamd docker restart, the swss and syncd needs to be restarted as there are dependent resources present. **- How I did it** Add the teamd as a dependent service for swss Updated the docker-wait script to handle service and dependent services separately. Handle the case of warm-restart for the dependent service **- How to verify it** Verified the following scenario's with the following testbed VM1 ----------------------------[DUT 6100] -----------------------VM2, ping traffic continuous between VMs 1. Stop teamd docker alone > swss, syncd dockers seen going away > The LAG reference count error messages seen for a while till swss docker stops. > Dockers back up. 2. Enable WR mode for teamd. Stop teamd docker alone > swss, syncd dockers not removed. > The LAG reference count error messages not seen > Repeated stop teamd docker test - same result, no effect on swss/syncd. 3. Stop swss docker. > swss, teamd, syncd goes off - dockers comes back correctly, interfaces up 4. Enable WR mode for swss . Stop swss docker > swss goes off not affecting syncd/teamd dockers. 5. Config reload > no reference counter error seen, dockers comes back correctly, with interfaces up 6. Warm reboot, observations below > swss docker goes off first > teamd + syncd goes off to the end of WR process. > dockers comes back up fine. > ping traffic between VM's was NOT HIT 7. Fast reboot, observations below > teamd goes off first ( **confirmed swss don't exit here** ) > swss goes off next > syncd goes away at the end of the FR process > dockers comes back up fine. > there is a traffic HIT as per fast-reboot 8. Verified in multi-asic platform, the tests above other than WR/FB scenarios
2020-10-23 02:41:16 -05:00
ALL_DEPS_RUNNING=true
for dep in ${MULTI_INST_DEPENDENT}; do
if [[ ! -z $DEV ]]; then
DEP_RUNNING=$(docker inspect -f '{{.State.Running}}' ${dep}$DEV)
else
DEP_RUNNING=$(docker inspect -f '{{.State.Running}}' ${dep})
fi
if [[ x"$DEP_RUNNING" != x"true" ]]; then
ALL_DEPS_RUNNING=false
break
fi
done
if [[ x"$ALL_PEERS_RUNNING" == x"true" && x"$ALL_DEPS_RUNNING" == x"true" ]]; then
break
else
sleep 1
fi
done
# NOTE: This assumes Docker containers share the same names as their
# corresponding services
[docker-teamd]: Add teamd as a depedent service to swss (#5628) **- Why I did it** On teamd docker restart, the swss and syncd needs to be restarted as there are dependent resources present. **- How I did it** Add the teamd as a dependent service for swss Updated the docker-wait script to handle service and dependent services separately. Handle the case of warm-restart for the dependent service **- How to verify it** Verified the following scenario's with the following testbed VM1 ----------------------------[DUT 6100] -----------------------VM2, ping traffic continuous between VMs 1. Stop teamd docker alone > swss, syncd dockers seen going away > The LAG reference count error messages seen for a while till swss docker stops. > Dockers back up. 2. Enable WR mode for teamd. Stop teamd docker alone > swss, syncd dockers not removed. > The LAG reference count error messages not seen > Repeated stop teamd docker test - same result, no effect on swss/syncd. 3. Stop swss docker. > swss, teamd, syncd goes off - dockers comes back correctly, interfaces up 4. Enable WR mode for swss . Stop swss docker > swss goes off not affecting syncd/teamd dockers. 5. Config reload > no reference counter error seen, dockers comes back correctly, with interfaces up 6. Warm reboot, observations below > swss docker goes off first > teamd + syncd goes off to the end of WR process. > dockers comes back up fine. > ping traffic between VM's was NOT HIT 7. Fast reboot, observations below > teamd goes off first ( **confirmed swss don't exit here** ) > swss goes off next > syncd goes away at the end of the FR process > dockers comes back up fine. > there is a traffic HIT as per fast-reboot 8. Verified in multi-asic platform, the tests above other than WR/FB scenarios
2020-10-23 02:41:16 -05:00
for dep in ${MULTI_INST_DEPENDENT}; do
if [[ ! -z $DEV ]]; then
ALL_DEPS="$ALL_DEPS ${dep}$DEV"
else
ALL_DEPS="$ALL_DEPS ${dep}"
fi
done
if [[ ! -z $DEV ]]; then
/usr/bin/docker-wait-any -s ${SERVICE}$DEV -d `printf "%s$DEV " ${PEER}` ${ALL_DEPS}
else
[docker-teamd]: Add teamd as a depedent service to swss (#5628) **- Why I did it** On teamd docker restart, the swss and syncd needs to be restarted as there are dependent resources present. **- How I did it** Add the teamd as a dependent service for swss Updated the docker-wait script to handle service and dependent services separately. Handle the case of warm-restart for the dependent service **- How to verify it** Verified the following scenario's with the following testbed VM1 ----------------------------[DUT 6100] -----------------------VM2, ping traffic continuous between VMs 1. Stop teamd docker alone > swss, syncd dockers seen going away > The LAG reference count error messages seen for a while till swss docker stops. > Dockers back up. 2. Enable WR mode for teamd. Stop teamd docker alone > swss, syncd dockers not removed. > The LAG reference count error messages not seen > Repeated stop teamd docker test - same result, no effect on swss/syncd. 3. Stop swss docker. > swss, teamd, syncd goes off - dockers comes back correctly, interfaces up 4. Enable WR mode for swss . Stop swss docker > swss goes off not affecting syncd/teamd dockers. 5. Config reload > no reference counter error seen, dockers comes back correctly, with interfaces up 6. Warm reboot, observations below > swss docker goes off first > teamd + syncd goes off to the end of WR process. > dockers comes back up fine. > ping traffic between VM's was NOT HIT 7. Fast reboot, observations below > teamd goes off first ( **confirmed swss don't exit here** ) > swss goes off next > syncd goes away at the end of the FR process > dockers comes back up fine. > there is a traffic HIT as per fast-reboot 8. Verified in multi-asic platform, the tests above other than WR/FB scenarios
2020-10-23 02:41:16 -05:00
/usr/bin/docker-wait-any -s ${SERVICE} -d ${PEER} ${ALL_DEPS}
fi
}
stop() {
debug "Stopping ${SERVICE}$DEV service..."
[[ -f ${LOCKFILE} ]] || /usr/bin/touch ${LOCKFILE}
lock_service_state_change
check_warm_boot
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
check_fast_boot
debug "Fast boot flag: ${SERVICE}$DEV ${FAST_BOOT}."
# For WARM/FAST boot do not perform service stop
if [[ x"$WARM_BOOT" != x"true" ]] && [[ x"$FAST_BOOT" != x"true" ]]; then
/usr/bin/${SERVICE}.sh stop $DEV
debug "Stopped ${SERVICE}$DEV service..."
$SONIC_DB_CLI APPL_DB DEL PORT_TABLE:PortInitDone
debug "Cleared PortInitDone from APPL_DB..."
else
debug "Killing Docker swss..."
/usr/bin/docker kill swss &> /dev/null || debug "Docker swss is not running ($?) ..."
fi
# Flush FAST_REBOOT table when swss needs to stop. The only
# time when this would take effect is when fast-reboot
# encountered error, e.g. syncd crashed. And swss needs to
# be restarted.
if [[ x"$FAST_BOOT" != x"true" ]]; then
debug "Clearing FAST_RESTART_ENABLE_TABLE flag..."
sonic-db-cli STATE_DB hset "FAST_RESTART_ENABLE_TABLE|system" "enable" "false"
fi
# Unlock has to happen before reaching out to peer service
unlock_service_state_change
stop_peer_and_dependent_services
}
function check_peer_gbsyncd()
{
GEARBOX_CONFIG=/usr/share/sonic/device/$PLATFORM/$HWSKU/$DEV/gearbox_config.json
if [ -f $GEARBOX_CONFIG ]; then
PEER="$PEER gbsyncd"
fi
}
function check_macsec()
{
MACSEC_STATE=`$SONIC_DB_CLI CONFIG_DB hget 'FEATURE|macsec' state`
if [[ ${MACSEC_STATE} == 'enabled' ]]; then
if [ "$DEV" ]; then
DEPENDENT="${DEPENDENT} macsec@${DEV}"
else
DEPENDENT="${DEPENDENT} macsec"
fi
fi
}
function check_add_bgp_dependency()
{
if ! is_chassis_supervisor; then
if [ "$DEV" ]; then
DEPENDENT="${DEPENDENT} bgp@${DEV}"
else
DEPENDENT="${DEPENDENT} bgp"
fi
fi
}
function check_ports_present()
{
PORT_CONFIG_INI=/usr/share/sonic/device/$PLATFORM/$HWSKU/$DEV/port_config.ini
HWSKU_JSON=/usr/share/sonic/device/$PLATFORM/$HWSKU/$DEV/hwsku.json
if [[ -f $PORT_CONFIG_INI ]] || [[ -f $HWSKU_JSON ]]; then
return 0
fi
return 1
}
function check_service_exists()
{
systemctl list-units --full -all 2>/dev/null | grep -Fq $1
if [[ $? -eq 0 ]]; then
echo true
return
else
echo false
return
fi
}
# DEPENDENT initially contains namespace independent services
# namespace specific services are added later in this script.
DEPENDENT=""
MULTI_INST_DEPENDENT=""
if [[ $(check_service_exists radv) == "true" ]]; then
DEPENDENT="$DEPENDENT radv"
fi
if [ "$DEV" ]; then
NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace
SONIC_DB_CLI="sonic-db-cli -n $NET_NS"
else
NET_NS=""
SONIC_DB_CLI="sonic-db-cli"
fi
PLATFORM=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' platform`
HWSKU=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' hwsku`
check_peer_gbsyncd
check_macsec
check_add_bgp_dependency
check_ports_present
PORTS_PRESENT=$?
if [[ $PORTS_PRESENT == 0 ]] && [[ $(check_service_exists teamd) == "true" ]]; then
MULTI_INST_DEPENDENT="teamd"
fi
read_dependent_services
case "$1" in
start|wait|stop)
$1
;;
*)
echo "Usage: $0 {start|wait|stop}"
exit 1
;;
esac