[swss] Chassis db clean up optimization and bug fixes (#16454)

* [swss] Chassis db clean up optimization and bug fixes

This commit includes the following changes:
    - Fix for regression failure due to error in finding CHASSIS_APP_DB in
    pizzabox (#PR 16451)
    - After attempting to delete the system neighbor entries from
    chassis db, before starting to clear the system interface entries,
    wait for some time only if some system neighbors were deleted.
    If no system neighbor entries were deleted for the asic coming up,
    there is no need to wait.
    - Similar changes for system lag delete. Before deleting the
    system lag, wait for some time only if some system lag members were
    deleted. If no system lag members were deleted, there is no need to wait.
    - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While asic
    is coming up, when system neigh entries are deleted from chassis app
    db (as part of chassis db clean up), there is no orchs/process running to
    process the delete messages from chassis redis. Because of this, stale system
    neigh entries are present in the local STATE_DB. The stale entries result in
    creation of orphan (no corresponding data path/asic db entry) kernel neigh
    entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after
    the swss service came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from
    the local STATE_DB when the service comes up.

Signed-off-by: vedganes <veda.ganesan@nokia.com>

* [swss] Chassis db clean up bug fixes review comment fix - 1

Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE)

Signed-off-by: vedganes <veda.ganesan@nokia.com>

---------

Signed-off-by: vedganes <veda.ganesan@nokia.com>
This commit is contained in:
vganesan-nokia 2023-09-11 11:28:27 -04:00 committed by GitHub
parent 9c1c82e9ff
commit b13b41fc22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -124,12 +124,7 @@ function clean_up_tables()
# SYSTEM_LAG_ID_TABLE and SYSTEM_LAG_ID_SET are adjusted appropriately # SYSTEM_LAG_ID_TABLE and SYSTEM_LAG_ID_SET are adjusted appropriately
function clean_up_chassis_db_tables() function clean_up_chassis_db_tables()
{ {
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
return
fi
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
switch_type=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'switch_type'` switch_type=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'switch_type'`
# Run clean up only in swss running for voq switches # Run clean up only in swss running for voq switches
@ -137,8 +132,16 @@ function clean_up_chassis_db_tables()
return return
fi fi
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
return
fi
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
# First, delete SYSTEM_NEIGH entries # First, delete SYSTEM_NEIGH entries
$SONIC_DB_CLI CHASSIS_APP_DB EVAL " num_neigh=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nn = 0
local host = string.gsub(ARGV[1], '%-', '%%-') local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2] local dev = ARGV[2]
local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev
@ -146,19 +149,26 @@ function clean_up_chassis_db_tables()
for j,key in ipairs(keylist) do for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then if string.match(key, ps) ~= nil then
redis.call('DEL', key) redis.call('DEL', key)
nn = nn + 1
end end
end end
return " 0 $lc $asic return nn" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_NEIGH entries deleted: $num_neigh"
# Wait for some time before deleting system interface so that the system interface's "object in use" # Wait for some time before deleting system interface so that the system interface's "object in use"
# is cleared in both orchangent and in syncd. Without this delay, the orchagent clears the refcount # is cleared in both orchangent and in syncd. Without this delay, the orchagent clears the refcount
# but the syncd (meta) still has no-zero refcount. Because of this, orchagent gets "object still in use" # but the syncd (meta) still has no-zero refcount. Because of this, orchagent gets "object still in use"
# error and aborts. # error and aborts.
# This delay is needed only if some system neighbors were deleted.
if [[ $num_neigh > 0 ]]; then
sleep 30 sleep 30
fi
# Next, delete SYSTEM_INTERFACE entries # Next, delete SYSTEM_INTERFACE entries
$SONIC_DB_CLI CHASSIS_APP_DB EVAL " num_sys_intf=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nsi = 0
local host = string.gsub(ARGV[1], '%-', '%%-') local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2] local dev = ARGV[2]
local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev
@ -166,12 +176,16 @@ function clean_up_chassis_db_tables()
for j,key in ipairs(keylist) do for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then if string.match(key, ps) ~= nil then
redis.call('DEL', key) redis.call('DEL', key)
nsi = nsi + 1
end end
end end
return " 0 $lc $asic return nsi" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_INTERFACE entries deleted: $num_sys_intf"
# Next, delete SYSTEM_LAG_MEMBER_TABLE entries # Next, delete SYSTEM_LAG_MEMBER_TABLE entries
$SONIC_DB_CLI CHASSIS_APP_DB EVAL " num_lag_mem=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nlm = 0
local host = string.gsub(ARGV[1], '%-', '%%-') local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2] local dev = ARGV[2]
local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev
@ -179,17 +193,24 @@ function clean_up_chassis_db_tables()
for j,key in ipairs(keylist) do for j,key in ipairs(keylist) do
if string.match(key, ps) ~= nil then if string.match(key, ps) ~= nil then
redis.call('DEL', key) redis.call('DEL', key)
nlm = nlm + 1
end end
end end
return " 0 $lc $asic return nlm" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"
# Wait for some time before deleting system lag so that the all the memebers of the # Wait for some time before deleting system lag so that the all the memebers of the
# system lag will be cleared. # system lag will be cleared.
# This delay is needed only if some system lag members were deleted
if [[ $num_lag_mem > 0 ]]; then
sleep 15 sleep 15
fi
# Finally, delete SYSTEM_LAG_TABLE entries and deallot LAG IDs # Finally, delete SYSTEM_LAG_TABLE entries and deallot LAG IDs
$SONIC_DB_CLI CHASSIS_APP_DB EVAL " num_sys_lag=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
local nsl = 0
local host = string.gsub(ARGV[1], '%-', '%%-') local host = string.gsub(ARGV[1], '%-', '%%-')
local dev = ARGV[2] local dev = ARGV[2]
local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')' local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
@ -201,9 +222,12 @@ function clean_up_chassis_db_tables()
local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname) local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid) redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname) redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
nsl = nsl + 1
end end
end end
return " 0 $lc $asic return nsl" 0 $lc $asic`
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_TABLE entries deleted: $num_sys_lag"
} }
@ -275,7 +299,7 @@ start() {
$SONIC_DB_CLI GB_ASIC_DB FLUSHDB $SONIC_DB_CLI GB_ASIC_DB FLUSHDB
$SONIC_DB_CLI GB_COUNTERS_DB FLUSHDB $SONIC_DB_CLI GB_COUNTERS_DB FLUSHDB
$SONIC_DB_CLI RESTAPI_DB FLUSHDB $SONIC_DB_CLI RESTAPI_DB FLUSHDB
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'VNET_ROUTE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*'" clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'VNET_ROUTE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*','SYSTEM_NEIGH_TABLE*'"
$SONIC_DB_CLI APPL_STATE_DB FLUSHDB $SONIC_DB_CLI APPL_STATE_DB FLUSHDB
clean_up_chassis_db_tables clean_up_chassis_db_tables
rm -rf /tmp/cache rm -rf /tmp/cache