This repository has been archived on 2025-03-20. You can view files and clone it, but cannot push or open issues or pull requests.
sonic-buildimage/files/scripts/syncd.sh
Vivek 7781399bb6
[201911][Mellanox] Collect MST dump before syncd restart on shutdown notification (#11742)
- Why I did it
Collecting MST dump before syncd restart on shutdown notification during a SAI failure

Dump can be found under:
root@sonic:/home/admin# ls -l /var/dump/mstdump/
total 10684
-rw-r--r-- 1 root root 5460332 Aug 15 18:41 mstdump_20220815_184143.tar.gz
-rw-r--r-- 1 root root 5473253 Aug 15 21:46 mstdump_20220815_214642.tar.gz

root@sonic:/home/admin# tar -xvzf /var/dump/mstdump/mstdump_20220815_214642.tar.gz
├── ir-gdb
│   └── core
└── mstdump
    ├── mstdump1
    ├── mstdump2
    ├── mstdump3
    └── mststatus

- How I did it
Checked for shutdown notification log in sairedis and used it to determine whether the shutdown is normal or due to SAI failure

- How to verify it
Simulated a SAI failure event and verified it. Verified it also on different reboots and config reload scenarios the dump is not generated

Signed-off-by: Vivek Reddy <vkarri@nvidia.com>
2022-08-29 16:09:26 +03:00

257 lines
6.9 KiB
Bash
Executable File

#!/bin/bash
function debug()
{
/usr/bin/logger $1
/bin/echo `date` "- $1" >> ${DEBUGLOG}
}
function lock_service_state_change()
{
debug "Locking ${LOCKFILE} from ${SERVICE}$DEV service"
exec {LOCKFD}>${LOCKFILE}
/usr/bin/flock -x ${LOCKFD}
trap "/usr/bin/flock -u ${LOCKFD}" 0 2 3 15
debug "Locked ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
}
function unlock_service_state_change()
{
debug "Unlocking ${LOCKFILE} (${LOCKFD}) from ${SERVICE}$DEV service"
/usr/bin/flock -u ${LOCKFD}
}
function check_warm_boot()
{
SYSTEM_WARM_START=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|system" enable`
SERVICE_WARM_START=`$SONIC_DB_CLI STATE_DB hget "WARM_RESTART_ENABLE_TABLE|${SERVICE}" enable`
# SYSTEM_WARM_START could be empty, always make WARM_BOOT meaningful.
if [[ x"$SYSTEM_WARM_START" == x"true" ]] || [[ x"$SERVICE_WARM_START" == x"true" ]]; then
WARM_BOOT="true"
else
WARM_BOOT="false"
fi
}
function wait_for_database_service()
{
# Wait for redis server start before database clean
until [[ $($SONIC_DB_CLI PING | grep -c PONG) -gt 0 ]]; do
sleep 1;
done
# Wait for configDB initialization
until [[ $($SONIC_DB_CLI CONFIG_DB GET "CONFIG_DB_INITIALIZED") ]];
do sleep 1;
done
}
function getBootType()
{
# same code snippet in files/build_templates/docker_image_ctl.j2
case "$(cat /proc/cmdline)" in
*SONIC_BOOT_TYPE=warm*)
TYPE='warm'
;;
*SONIC_BOOT_TYPE=fastfast*)
TYPE='fastfast'
;;
*SONIC_BOOT_TYPE=fast*|*fast-reboot*)
# check that the key exists
if [[ $($SONIC_DB_CLI STATE_DB GET "FAST_REBOOT|system") == "1" ]]; then
TYPE='fast'
else
TYPE='cold'
fi
;;
*)
TYPE='cold'
esac
echo "${TYPE}"
}
start() {
debug "Starting ${SERVICE}$DEV service..."
lock_service_state_change
mkdir -p /host/warmboot
wait_for_database_service
check_warm_boot
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
if [[ x"$WARM_BOOT" == x"true" ]]; then
# Leave a mark for syncd scripts running inside docker.
touch /host/warmboot/warm-starting
else
rm -f /host/warmboot/warm-starting
fi
# platform specific tasks
# start mellanox drivers regardless of
# boot type
if [[ x"$sonic_asic_platform" == x"mellanox" ]]; then
BOOT_TYPE=`getBootType`
if [[ x"$WARM_BOOT" == x"true" || x"$BOOT_TYPE" == x"fast" ]]; then
export FAST_BOOT=1
fi
if [[ x"$WARM_BOOT" != x"true" ]]; then
if [[ x"$(/bin/systemctl is-active pmon)" == x"active" ]]; then
/bin/systemctl stop pmon
debug "pmon is active while syncd starting, stop it first"
fi
fi
/usr/bin/mst start --with_i2cdev
/usr/bin/mlnx-fw-upgrade.sh
/etc/init.d/sxdkernel start
fi
if [[ x"$WARM_BOOT" != x"true" ]]; then
if [ x$sonic_asic_platform == x'cavium' ]; then
/etc/init.d/xpnet.sh start
fi
fi
# start service docker
/usr/bin/${SERVICE}.sh start $DEV
debug "Started ${SERVICE} service..."
unlock_service_state_change
}
wait() {
if [[ x"$sonic_asic_platform" == x"mellanox" ]]; then
debug "Starting pmon service..."
/bin/systemctl start pmon
debug "Started pmon service"
fi
/usr/bin/${SERVICE}.sh wait $DEV
}
collect_mst() {
debug "Collecting MST dump before syncd restart"
TMPDIR=/tmp/mlnxdump
MSTDIR=$TMPDIR/mstdump
DUMPDIR=/var/dump/mstdump
mkdir -p $MSTDIR
if ! mst status -v &> $MSTDIR/mststatus; then
debug "mst status command returned error"
else
local mst_dump_filename="$MSTDIR/mstdump"
local max_dump_count="3"
for i in $(seq 1 $max_dump_count); do
if ! ${CMD_PREFIX}/usr/bin/mstdump /dev/mst/mt*pci_cr0 > "${mst_dump_filename}${i}"; then
debug "mstdump failed"
break
fi
done
fi
mkdir -p $DUMPDIR
TARFILE=mstdump_`date +%Y%m%d_%H%M%S`.tar
tar -C $TMPDIR -cf $DUMPDIR/$TARFILE .
gzip -f $DUMPDIR/$TARFILE
debug "MST dump created $DUMPDIR/$TARFILE.gz"
rm -rf $TMPDIR
# Maintaining the recent 3 files and removing the rest
ls -1td $DUMPDIR/* | tail -n +4 | xargs rm -rf
}
stop() {
debug "Stopping ${SERVICE}$DEV service..."
lock_service_state_change
check_warm_boot
debug "Warm boot flag: ${SERVICE}$DEV ${WARM_BOOT}."
if [[ x"$WARM_BOOT" == x"true" ]]; then
TYPE=warm
else
TYPE=cold
fi
if [[ x$sonic_asic_platform == x"mellanox" ]] && [[ x$TYPE == x"cold" ]]; then
debug "Stopping pmon service ahead of syncd..."
/bin/systemctl stop pmon
debug "Stopped pmon service"
fi
if [[ x$sonic_asic_platform == x"mellanox" ]]; then
# In case of SAI failure the last line of sairedis.rec would contain switch_shutdown_request
if tail -1 /var/log/swss/sairedis.rec | grep -q switch_shutdown_request; then
collect_mst
fi
fi
if [[ x$sonic_asic_platform != x"mellanox" ]] || [[ x$TYPE != x"cold" ]]; then
debug "${TYPE} shutdown syncd process ..."
/usr/bin/docker exec -i syncd$DEV /usr/bin/syncd_request_shutdown --${TYPE}
# wait until syncd quits gracefully or force syncd to exit after
# waiting for 20 seconds
start_in_secs=${SECONDS}
end_in_secs=${SECONDS}
timer_threshold=20
while docker top syncd$DEV | grep -q /usr/bin/syncd \
&& [[ $((end_in_secs - start_in_secs)) -le $timer_threshold ]]; do
sleep 0.1
end_in_secs=${SECONDS}
done
if [[ $((end_in_secs - start_in_secs)) -gt $timer_threshold ]]; then
debug "syncd process in container syncd$DEV did not exit gracefully"
fi
/usr/bin/docker exec -i syncd$DEV /bin/sync
debug "Finished ${TYPE} shutdown syncd process ..."
fi
/usr/bin/${SERVICE}.sh stop $DEV
debug "Stopped ${SERVICE}$DEV service..."
# platform specific tasks
if [[ x"$WARM_BOOT" != x"true" ]]; then
if [ x$sonic_asic_platform == x'mellanox' ]; then
/etc/init.d/sxdkernel stop
/usr/bin/mst stop
elif [ x$sonic_asic_platform == x'cavium' ]; then
/etc/init.d/xpnet.sh stop
/etc/init.d/xpnet.sh start
fi
fi
unlock_service_state_change
}
OP=$1
DEV=$2
SERVICE="syncd"
PEER="swss"
DEBUGLOG="/tmp/swss-syncd-debug$DEV.log"
LOCKFILE="/tmp/swss-syncd-lock$DEV"
NAMESPACE_PREFIX="asic"
if [ "$DEV" ]; then
NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace
SONIC_DB_CLI="sonic-db-cli -n $NET_NS"
else
NET_NS=""
SONIC_DB_CLI="sonic-db-cli"
fi
case "$1" in
start|wait|stop)
$1
;;
*)
echo "Usage: $0 {start|wait|stop}"
exit 1
;;
esac