- Why I did it Collecting MST dump before syncd restart on shutdown notification during a SAI failure Dump can be found under: root@sonic:/home/admin# ls -l /var/dump/mstdump/ total 10684 -rw-r--r-- 1 root root 5460332 Aug 15 18:41 mstdump_20220815_184143.tar.gz -rw-r--r-- 1 root root 5473253 Aug 15 21:46 mstdump_20220815_214642.tar.gz root@sonic:/home/admin# tar -xvzf /var/dump/mstdump/mstdump_20220815_214642.tar.gz ├── ir-gdb │ └── core └── mstdump ├── mstdump1 ├── mstdump2 ├── mstdump3 └── mststatus - How I did it Checked for shutdown notification log in sairedis and used it to determine whether the shutdown is normal or due to SAI failure - How to verify it Simulated a SAI failure event and verified it. Verified it also on different reboots and config reload scenarios the dump is not generated Signed-off-by: Vivek Reddy <vkarri@nvidia.com>
257 lines
6.9 KiB
Bash
Executable File
257 lines
6.9 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
|
|
function debug()
|
|
{
|
|
/usr/bin/logger $1
|
|
/bin/echo `date` "- $1" >> ${DEBUGLOG}
|
|
}
|
|
|
|
function lock_service_state_change()
|
|
{
|
|
debug "Locking ${LOCKFILE} from ${SERVICE}$DEV service"
|
|
|
|
exec {LOCKFD}>${LOCKFILE}
|
|
/usr/bin/flock -x ${LOCKFD}
|
|
trap "/usr/bin/flock -u ${LOCKFD}" 0 2 3 15
|
|
|
|
debug "Locked ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
|
|
}
|
|
|
|
function unlock_service_state_change()
|
|
{
|
|
debug "Unlocking ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
|
|
/usr/bin/flock -u ${LOCKFD}
|
|
}
|
|
|
|
function check_warm_boot()
|
|
{
|
|
SYSTEM_WARM_START=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|system" enable`
|
|
SERVICE_WARM_START=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|${SERVICE}" enable`
|
|
# SYSTEM_WARM_START could be empty, always make WARM_BOOT meaningful.
|
|
if [[ x"$SYSTEM_WARM_START" == x"true" ]] || [[ x"$SERVICE_WARM_START" == x"true" ]]; then
|
|
WARM_BOOT="true"
|
|
else
|
|
WARM_BOOT="false"
|
|
fi
|
|
}
|
|
|
|
function wait_for_database_service()
|
|
{
|
|
# Wait for redis server start before database clean
|
|
until [[ $($SONIC_DB_CLI PING | grep -c PONG) -gt 0 ]]; do
|
|
sleep 1;
|
|
done
|
|
|
|
# Wait for configDB initialization
|
|
until [[ $($SONIC_DB_CLI CONFIG_DB GET "CONFIG_DB_INITIALIZED") ]];
|
|
do sleep 1;
|
|
done
|
|
}
|
|
|
|
function getBootType()
|
|
{
|
|
# same code snippet in files/build_templates/docker_image_ctl.j2
|
|
case "$(cat /proc/cmdline)" in
|
|
*SONIC_BOOT_TYPE=warm*)
|
|
TYPE='warm'
|
|
;;
|
|
*SONIC_BOOT_TYPE=fastfast*)
|
|
TYPE='fastfast'
|
|
;;
|
|
*SONIC_BOOT_TYPE=fast*|*fast-reboot*)
|
|
# check that the key exists
|
|
if [[ $($SONIC_DB_CLI STATE_DB GET "FAST_REBOOT|system") == "1" ]]; then
|
|
TYPE='fast'
|
|
else
|
|
TYPE='cold'
|
|
fi
|
|
;;
|
|
*)
|
|
TYPE='cold'
|
|
esac
|
|
echo "${TYPE}"
|
|
}
|
|
|
|
start() {
|
|
debug "Starting ${SERVICE}$DEV service..."
|
|
|
|
lock_service_state_change
|
|
|
|
mkdir -p /host/warmboot
|
|
|
|
wait_for_database_service
|
|
check_warm_boot
|
|
|
|
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
|
|
|
|
if [[ x"$WARM_BOOT" == x"true" ]]; then
|
|
# Leave a mark for syncd scripts running inside docker.
|
|
touch /host/warmboot/warm-starting
|
|
else
|
|
rm -f /host/warmboot/warm-starting
|
|
fi
|
|
|
|
# platform specific tasks
|
|
|
|
# start mellanox drivers regardless of
|
|
# boot type
|
|
if [[ x"$sonic_asic_platform" == x"mellanox" ]]; then
|
|
BOOT_TYPE=`getBootType`
|
|
if [[ x"$WARM_BOOT" == x"true" || x"$BOOT_TYPE" == x"fast" ]]; then
|
|
export FAST_BOOT=1
|
|
fi
|
|
|
|
if [[ x"$WARM_BOOT" != x"true" ]]; then
|
|
if [[ x"$(/bin/systemctl is-active pmon)" == x"active" ]]; then
|
|
/bin/systemctl stop pmon
|
|
debug "pmon is active while syncd starting, stop it first"
|
|
fi
|
|
fi
|
|
|
|
/usr/bin/mst start --with_i2cdev
|
|
/usr/bin/mlnx-fw-upgrade.sh
|
|
/etc/init.d/sxdkernel start
|
|
fi
|
|
|
|
if [[ x"$WARM_BOOT" != x"true" ]]; then
|
|
if [ x$sonic_asic_platform == x'cavium' ]; then
|
|
/etc/init.d/xpnet.sh start
|
|
fi
|
|
fi
|
|
|
|
# start service docker
|
|
/usr/bin/${SERVICE}.sh start $DEV
|
|
debug "Started ${SERVICE} service..."
|
|
|
|
unlock_service_state_change
|
|
}
|
|
|
|
wait() {
|
|
if [[ x"$sonic_asic_platform" == x"mellanox" ]]; then
|
|
debug "Starting pmon service..."
|
|
/bin/systemctl start pmon
|
|
debug "Started pmon service"
|
|
fi
|
|
/usr/bin/${SERVICE}.sh wait $DEV
|
|
}
|
|
|
|
collect_mst() {
|
|
debug "Collecting MST dump before syncd restart"
|
|
TMPDIR=/tmp/mlnxdump
|
|
MSTDIR=$TMPDIR/mstdump
|
|
DUMPDIR=/var/dump/mstdump
|
|
mkdir -p $MSTDIR
|
|
if ! mst status -v &> $MSTDIR/mststatus; then
|
|
debug "mst status command returned error"
|
|
else
|
|
local mst_dump_filename="$MSTDIR/mstdump"
|
|
local max_dump_count="3"
|
|
for i in $(seq 1 $max_dump_count); do
|
|
if ! ${CMD_PREFIX}/usr/bin/mstdump /dev/mst/mt*pci_cr0 > "${mst_dump_filename}${i}"; then
|
|
debug "mstdump failed"
|
|
break
|
|
fi
|
|
done
|
|
fi
|
|
mkdir -p $DUMPDIR
|
|
TARFILE=mstdump_`date +%Y%m%d_%H%M%S`.tar
|
|
tar -C $TMPDIR -cf $DUMPDIR/$TARFILE .
|
|
gzip -f $DUMPDIR/$TARFILE
|
|
debug "MST dump created $DUMPDIR/$TARFILE.gz"
|
|
rm -rf $TMPDIR
|
|
# Maintaining the recent 3 files and removing the rest
|
|
ls -1td $DUMPDIR/* | tail -n +4 | xargs rm -rf
|
|
}
|
|
|
|
stop() {
|
|
debug "Stopping ${SERVICE}$DEV service..."
|
|
|
|
lock_service_state_change
|
|
check_warm_boot
|
|
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
|
|
|
|
if [[ x"$WARM_BOOT" == x"true" ]]; then
|
|
TYPE=warm
|
|
else
|
|
TYPE=cold
|
|
fi
|
|
|
|
if [[ x$sonic_asic_platform == x"mellanox" ]] && [[ x$TYPE == x"cold" ]]; then
|
|
debug "Stopping pmon service ahead of syncd..."
|
|
/bin/systemctl stop pmon
|
|
debug "Stopped pmon service"
|
|
fi
|
|
if [[ x$sonic_asic_platform == x"mellanox" ]]; then
|
|
# In case of SAI failure the last line of sairedis.rec would contain switch_shutdown_request
|
|
if tail -1 /var/log/swss/sairedis.rec | grep -q switch_shutdown_request; then
|
|
collect_mst
|
|
fi
|
|
fi
|
|
if [[ x$sonic_asic_platform != x"mellanox" ]] || [[ x$TYPE != x"cold" ]]; then
|
|
debug "${TYPE} shutdown syncd process ..."
|
|
/usr/bin/docker exec -i syncd$DEV /usr/bin/syncd_request_shutdown --${TYPE}
|
|
|
|
# wait until syncd quits gracefully or force syncd to exit after
|
|
# waiting for 20 seconds
|
|
start_in_secs=${SECONDS}
|
|
end_in_secs=${SECONDS}
|
|
timer_threshold=20
|
|
while docker top syncd$DEV | grep -q /usr/bin/syncd \
|
|
&& [[ $((end_in_secs - start_in_secs)) -le $timer_threshold ]]; do
|
|
sleep 0.1
|
|
end_in_secs=${SECONDS}
|
|
done
|
|
|
|
if [[ $((end_in_secs - start_in_secs)) -gt $timer_threshold ]]; then
|
|
debug "syncd process in container syncd$DEV did not exit gracefully"
|
|
fi
|
|
|
|
/usr/bin/docker exec -i syncd$DEV /bin/sync
|
|
debug "Finished ${TYPE} shutdown syncd process ..."
|
|
fi
|
|
|
|
/usr/bin/${SERVICE}.sh stop $DEV
|
|
debug "Stopped ${SERVICE}$DEV service..."
|
|
|
|
# platform specific tasks
|
|
|
|
if [[ x"$WARM_BOOT" != x"true" ]]; then
|
|
if [ x$sonic_asic_platform == x'mellanox' ]; then
|
|
/etc/init.d/sxdkernel stop
|
|
/usr/bin/mst stop
|
|
elif [ x$sonic_asic_platform == x'cavium' ]; then
|
|
/etc/init.d/xpnet.sh stop
|
|
/etc/init.d/xpnet.sh start
|
|
fi
|
|
fi
|
|
|
|
unlock_service_state_change
|
|
}
|
|
|
|
OP=$1
|
|
DEV=$2
|
|
|
|
SERVICE="syncd"
|
|
PEER="swss"
|
|
DEBUGLOG="/tmp/swss-syncd-debug$DEV.log"
|
|
LOCKFILE="/tmp/swss-syncd-lock$DEV"
|
|
NAMESPACE_PREFIX="asic"
|
|
if [ "$DEV" ]; then
|
|
NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace
|
|
SONIC_DB_CLI="sonic-db-cli -n $NET_NS"
|
|
else
|
|
NET_NS=""
|
|
SONIC_DB_CLI="sonic-db-cli"
|
|
fi
|
|
|
|
case "$1" in
|
|
start|wait|stop)
|
|
$1
|
|
;;
|
|
*)
|
|
echo "Usage: $0 {start|wait|stop}"
|
|
exit 1
|
|
;;
|
|
esac
|