[201911][Mellanox] Collect MST dump before syncd restart on shutdown notification (#11742)

- Why I did it
To collect an MST dump before syncd is restarted when a shutdown notification is received because of a SAI failure.

Dump can be found under:
root@sonic:/home/admin# ls -l /var/dump/mstdump/
total 10684
-rw-r--r-- 1 root root 5460332 Aug 15 18:41 mstdump_20220815_184143.tar.gz
-rw-r--r-- 1 root root 5473253 Aug 15 21:46 mstdump_20220815_214642.tar.gz

root@sonic:/home/admin# tar -xvzf /var/dump/mstdump/mstdump_20220815_214642.tar.gz
├── ir-gdb
│   └── core
└── mstdump
    ├── mstdump1
    ├── mstdump2
    ├── mstdump3
    └── mststatus

- How I did it
Checked the last line of the sairedis recording (sairedis.rec) for the switch shutdown request notification and used it to determine whether the shutdown is normal or due to a SAI failure.

- How to verify it
Simulated a SAI failure event and verified that the dump is generated. Also verified that the dump is not generated in the different reboot and config reload scenarios.

Signed-off-by: Vivek Reddy <vkarri@nvidia.com>
This commit is contained in:
Vivek 2022-08-29 06:09:26 -07:00 committed by GitHub
parent 61a34fcf22
commit 7781399bb6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -136,6 +136,34 @@ wait() {
/usr/bin/${SERVICE}.sh wait $DEV
}
# Capture Mellanox MST diagnostics into a timestamped archive.
#
# Stages the output of `mst status -v` and, when that command succeeds, up to
# three consecutive `mstdump` snapshots, packs them into
# /var/dump/mstdump/mstdump_<YYYYmmdd_HHMMSS>.tar.gz, and finally prunes the
# dump directory down to its three most recently modified entries.
#
# Globals:   CMD_PREFIX (read) - prefix placed in front of the mstdump path
# Arguments: none
# Returns:   0 always; collection is best-effort and failures are only logged
collect_mst() {
    debug "Collecting MST dump before syncd restart"

    # Everything is function-local. The staging directory in particular must
    # not be kept in TMPDIR: that name is honoured by mktemp and other tools,
    # and it would be left pointing at a directory this function deletes.
    local dump_dir=/var/dump/mstdump
    local max_dump_count=3
    local max_archives=3
    local staging_dir mst_dir tarfile i

    # A unique staging directory avoids packing leftovers from an interrupted
    # earlier run and avoids a predictable path under /tmp.
    if ! staging_dir=$(mktemp -d /tmp/mlnxdump.XXXXXX); then
        debug "Failed to create staging directory for MST dump"
        return 0
    fi
    mst_dir="$staging_dir/mstdump"

    if ! mkdir -p "$mst_dir" "$dump_dir"; then
        debug "Failed to create MST dump directories"
        rm -rf -- "$staging_dir"
        return 0
    fi

    if ! mst status -v &> "$mst_dir/mststatus"; then
        debug "mst status command returned error"
    else
        for ((i = 1; i <= max_dump_count; i++)); do
            # CMD_PREFIX and the device glob are deliberately left unquoted:
            # the glob must expand to the device node, and CMD_PREFIX is
            # defined outside this function (NOTE(review): it may be empty or
            # hold a multi-word prefix - confirm before quoting it).
            # shellcheck disable=SC2086
            if ! ${CMD_PREFIX}/usr/bin/mstdump /dev/mst/mt*pci_cr0 > "$mst_dir/mstdump${i}"; then
                debug "mstdump failed"
                break
            fi
        done
    fi

    tarfile="mstdump_$(date +%Y%m%d_%H%M%S).tar.gz"
    if tar -C "$staging_dir" -czf "$dump_dir/$tarfile" .; then
        debug "MST dump created $dump_dir/$tarfile"
    else
        # Do not report success for, or keep, a truncated archive.
        debug "Failed to create MST dump archive $dump_dir/$tarfile"
        rm -f -- "$dump_dir/$tarfile"
    fi
    rm -rf -- "$staging_dir"

    # Maintaining the most recent archives and removing the rest.
    # `xargs -r` skips rm entirely when there is nothing to prune, and
    # `-d '\n'` keeps names containing spaces intact.
    ls -1td -- "$dump_dir"/* 2>/dev/null \
        | tail -n "+$((max_archives + 1))" \
        | xargs -r -d '\n' rm -rf --

    return 0
}
stop() {
debug "Stopping ${SERVICE}$DEV service..."
@ -154,7 +182,12 @@ stop() {
/bin/systemctl stop pmon
debug "Stopped pmon service"
fi
if [[ x$sonic_asic_platform == x"mellanox" ]]; then
# In case of SAI failure the last line of sairedis.rec would contain switch_shutdown_request
if tail -1 /var/log/swss/sairedis.rec | grep -q switch_shutdown_request; then
collect_mst
fi
fi
if [[ x$sonic_asic_platform != x"mellanox" ]] || [[ x$TYPE != x"cold" ]]; then
debug "${TYPE} shutdown syncd process ..."
/usr/bin/docker exec -i syncd$DEV /usr/bin/syncd_request_shutdown --${TYPE}