From 7781399bb6358d54c69420a523c30aca2ebe4f4e Mon Sep 17 00:00:00 2001
From: Vivek
Date: Mon, 29 Aug 2022 06:09:26 -0700
Subject: [PATCH] [201911][Mellanox] Collect MST dump before syncd restart on shutdown notification (#11742)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Why I did it
Collecting MST dump before syncd restart on shutdown notification during a SAI failure

Dump can be found under:
root@sonic:/home/admin# ls -l /var/dump/mstdump/
total 10684
-rw-r--r-- 1 root root 5460332 Aug 15 18:41 mstdump_20220815_184143.tar.gz
-rw-r--r-- 1 root root 5473253 Aug 15 21:46 mstdump_20220815_214642.tar.gz
root@sonic:/home/admin# tar -xvzf /var/dump/mstdump/mstdump_20220815_214642.tar.gz
├── ir-gdb
│   └── core
└── mstdump
    ├── mstdump1
    ├── mstdump2
    ├── mstdump3
    └── mststatus

- How I did it
Checked for shutdown notification log in sairedis and used it to determine whether the shutdown is normal or due to SAI failure

- How to verify it
Simulated a SAI failure event and verified it.
Verified it also on different reboots and config reload scenarios the dump is not generated

Signed-off-by: Vivek Reddy
---
 files/scripts/syncd.sh | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh
index 87a39b71ae..4d08fd443a 100755
--- a/files/scripts/syncd.sh
+++ b/files/scripts/syncd.sh
@@ -136,6 +136,46 @@ wait() {
     /usr/bin/${SERVICE}.sh wait $DEV
 }
 
+# Archive Mellanox MST diagnostics before syncd is restarted.
+# Writes the output of "mst status -v" and up to three mstdump snapshots into
+# /tmp/mlnxdump/mstdump, packs everything under /tmp/mlnxdump into
+# /var/dump/mstdump/mstdump_<timestamp>.tar.gz, removes the staging
+# directory and keeps only the three most recent entries in /var/dump/mstdump.
+# Uses: debug (logging helper), CMD_PREFIX (prepended to the mstdump path).
+collect_mst() {
+    # Keep every name local and do not reuse the well-known TMPDIR name: a
+    # script-wide TMPDIR would be left pointing at a directory deleted below.
+    local staging_dir=/tmp/mlnxdump
+    local mst_dir="$staging_dir/mstdump"
+    local dump_dir=/var/dump/mstdump
+    local mst_dump_filename="$mst_dir/mstdump"
+    local max_dump_count="3"
+    local tarfile i
+
+    debug "Collecting MST dump before syncd restart"
+    mkdir -p "$mst_dir"
+    if ! mst status -v &> "$mst_dir/mststatus"; then
+        debug "mst status command returned error"
+    else
+        for i in $(seq 1 "$max_dump_count"); do
+            # The device glob is left unquoted on purpose so that it expands.
+            if ! ${CMD_PREFIX}/usr/bin/mstdump /dev/mst/mt*pci_cr0 > "${mst_dump_filename}${i}"; then
+                debug "mstdump failed"
+                break
+            fi
+        done
+    fi
+    mkdir -p "$dump_dir"
+    tarfile="mstdump_$(date +%Y%m%d_%H%M%S).tar"
+    # Pack the whole staging directory, not only its mstdump/ subdirectory.
+    tar -C "$staging_dir" -cf "$dump_dir/$tarfile" .
+    gzip -f "$dump_dir/$tarfile"
+    debug "MST dump created $dump_dir/$tarfile.gz"
+    rm -rf "$staging_dir"
+    # Maintaining the recent 3 files and removing the rest
+    ls -1td "$dump_dir"/* | tail -n +4 | xargs rm -rf
+}
+
 stop() {
     debug "Stopping ${SERVICE}$DEV service..."
 
@@ -154,7 +194,12 @@ stop() {
         /bin/systemctl stop pmon
         debug "Stopped pmon service"
     fi
-
+    if [[ x$sonic_asic_platform == x"mellanox" ]]; then
+        # In case of SAI failure the last line of sairedis.rec would contain switch_shutdown_request
+        if tail -1 /var/log/swss/sairedis.rec | grep -q switch_shutdown_request; then
+            collect_mst
+        fi
+    fi
     if [[ x$sonic_asic_platform != x"mellanox" ]] || [[ x$TYPE != x"cold" ]]; then
         debug "${TYPE} shutdown syncd process ..."
         /usr/bin/docker exec -i syncd$DEV /usr/bin/syncd_request_shutdown --${TYPE}