[swss] Chassis db clean up optimization and bug fixes (#16454)

* [swss] Chassis db clean up optimization and bug fixes

This commit includes the following changes:
    - Fix for regression failure due to error in finding CHASSIS_APP_DB in
    pizzabox (#PR 16451)
    - After attempting to delete the system neighbor entries from
    chassis db, before starting clearing the system interface entries,
    wait for some time only if some system neighbors were deleted.
    If there are no system neighbors entries deleted for the asic coming up,
    no need to wait.
    - Similar changes for system lag delete. Before deleting the
    system lag, wait for some time only if some system lag members were
    deleted. If there are no system lag members deleted no need to wait.
    - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic
    is coming up, when system neigh entries are deleted from chassis app
    db (as part of chassis db clean up), there are no orchs/processes running to
    process the delete messages from chassis redis. Because of this, stale system
    neigh entries are present in the local STATE_DB. The stale entries result in
    creation of orphan (no corresponding data path/asic db entry) kernel neigh
    entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after
    the swss service came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from
    the local STATE_DB when the service comes up.

Signed-off-by: vedganes <veda.ganesan@nokia.com>

* [swss] Chassis db clean up bug fixes review comment fix - 1

Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE)

Signed-off-by: vedganes <veda.ganesan@nokia.com>

---------

Signed-off-by: vedganes <veda.ganesan@nokia.com>
This commit is contained in:
vganesan-nokia 2023-09-11 11:28:27 -04:00 committed by GitHub
parent 9c1c82e9ff
commit b13b41fc22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -124,12 +124,7 @@ function clean_up_tables()
# SYSTEM_LAG_ID_TABLE and SYSTEM_LAG_ID_SET are adjusted appropriately
function clean_up_chassis_db_tables()
{
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
return
fi
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
switch_type=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'switch_type'`
# Run clean up only in swss running for voq switches
@ -137,8 +132,16 @@ function clean_up_chassis_db_tables()
return
fi
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
return
fi
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
# First, delete SYSTEM_NEIGH entries
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
num_neigh=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nn = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev
@ -146,19 +149,26 @@ function clean_up_chassis_db_tables()
for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then
redis.call('DEL', key)
nn = nn + 1
end
end
return " 0 $lc $asic
return nn" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_NEIGH entries deleted: $num_neigh"
# Wait for some time before deleting system interface so that the system interface's "object in use"
# is cleared in both orchagent and in syncd. Without this delay, the orchagent clears the refcount
# but the syncd (meta) still has non-zero refcount. Because of this, orchagent gets "object still in use"
# error and aborts.
# This delay is needed only if some system neighbors were deleted.
if [[ $num_neigh > 0 ]]; then
sleep 30
fi
# Next, delete SYSTEM_INTERFACE entries
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
num_sys_intf=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nsi = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev
@ -166,12 +176,16 @@ function clean_up_chassis_db_tables()
for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then
redis.call('DEL', key)
nsi = nsi + 1
end
end
return " 0 $lc $asic
return nsi" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_INTERFACE entries deleted: $num_sys_intf"
# Next, delete SYSTEM_LAG_MEMBER_TABLE entries
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
num_lag_mem=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nlm = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev
@ -179,17 +193,24 @@ function clean_up_chassis_db_tables()
for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then
redis.call('DEL', key)
nlm = nlm + 1
end
end
return " 0 $lc $asic
return nlm" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"
# Wait for some time before deleting system lag so that all the members of the
# system lag will be cleared.
# This delay is needed only if some system lag members were deleted
if [[ $num_lag_mem > 0 ]]; then
sleep 15
fi
# Finally, delete SYSTEM_LAG_TABLE entries and deallot LAG IDs
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
num_sys_lag=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nsl = 0
local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2]
local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
@ -201,9 +222,12 @@ function clean_up_chassis_db_tables()
local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
nsl = nsl + 1
end
end
return " 0 $lc $asic
return nsl" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_TABLE entries deleted: $num_sys_lag"
}
@ -275,7 +299,7 @@ start() {
$SONIC_DB_CLI GB_ASIC_DB FLUSHDB
$SONIC_DB_CLI GB_COUNTERS_DB FLUSHDB
$SONIC_DB_CLI RESTAPI_DB FLUSHDB
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'VNET_ROUTE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*'"
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'VNET_ROUTE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*','SYSTEM_NEIGH_TABLE*'"
$SONIC_DB_CLI APPL_STATE_DB FLUSHDB
clean_up_chassis_db_tables
rm -rf /tmp/cache