sonic-buildimage/files/scripts/syncd.sh
Michael Li f753a8ba04 Reload BCM SDK kmods on syncd start to handle syncd restart issues (#12804)
Why I did it
There is an issue on the Arista PikeZ platform (using T3.X2: BCM56274) while running SONiC. If the 'syncd' container in SONiC is restarted, the expected behaviour is that syncd automatically restarts and recovers; however, it does not recover and always fails at create_switch because the BCM SDK kmod's DMA operation cancellation gets stuck.

Sep 16 22:19:44.855125 pkz208 ERR syncd#syncd: [none] SAI_API_SWITCH:platform_process_command:428 Platform command "init soc" failed, rc = -1.
Sep 16 22:19:44.855206 pkz208 INFO syncd#supervisord: syncd CMIC_CMC0_PKTDMA_CH4_DESC_COUNT_REQ:0x33#015
Sep 16 22:19:44.855264 pkz208 CRIT syncd#syncd: [none] SAI_API_SWITCH:platformInit:1909 initialization command "init soc" failed, rc = -1 (Internal error).
Sep 16 22:19:44.855403 pkz208 CRIT syncd#syncd: [none] SAI_API_SWITCH:sai_driver_init:642 Error initializing driver, rc = -1.
...
Sep 16 22:19:44.855891 pkz208 CRIT syncd#syncd: [none] SAI_API_SWITCH:brcm_sai_create_switch:1173 initializing SDK failed with error Operation failed (0xfffffff5).

Reloading the BCM SDK kmods allows the switch init to continue properly.

How I did it
If the BCM SDK kmods are loaded, unload and reload them in the syncd docker start script.

How to verify it
Steps to reproduce:

In SONiC, run 'docker ps' to see current running containers; 'syncd' should be present.
Run 'docker stop syncd'
Wait ~1 minute.
Run 'docker ps' to see that syncd is missing.
Check logs to see messages similar to the above.

Signed-off-by: Michael Li <michael.li@broadcom.com>
2022-12-10 10:33:21 +08:00

160 lines
4.7 KiB
Bash
Executable File

#!/bin/bash
. /usr/local/bin/syncd_common.sh
#######################################
# Platform-specific preparation performed before syncd is started.
# Globals:
#   sonic_asic_platform (read)   - compared against mellanox / broadcom /
#                                  barefoot / cavium
#   WARM_BOOT (read)             - "true" skips the cold-start-only steps
#   BOOT_TYPE (written)          - output of getBootType (mellanox only)
#   FAST_BOOT (written/exported) - set to 1 on mellanox warm or fast boot
# Arguments: none
# Note: `debug` and `getBootType` are not defined in this file; presumably
#       they come from the sourced syncd_common.sh.
#######################################
function startplatform() {
    # Mellanox: the drivers are started regardless of boot type.
    if [[ "$sonic_asic_platform" == "mellanox" ]]; then
        BOOT_TYPE=$(getBootType)
        if [[ "$WARM_BOOT" == "true" || "$BOOT_TYPE" == "fast" ]]; then
            export FAST_BOOT=1
        fi

        if [[ "$WARM_BOOT" != "true" ]]; then
            # pmon must not be running while syncd starts on a non-warm boot.
            if [[ "$(/bin/systemctl is-active pmon)" == "active" ]]; then
                /bin/systemctl stop pmon
                debug "pmon is active while syncd starting, stop it first"
            fi
        fi

        debug "Starting Firmware update procedure"
        /usr/bin/mst start --with_i2cdev
        /usr/bin/mlnx-fw-upgrade.sh
        /etc/init.d/sxdkernel restart
        debug "Firmware update procedure ended"
    fi

    # Broadcom: on a non-warm start, reload the SDK kernel modules if they are
    # already loaded (bcm0 netdev present) so a restarted syncd can
    # re-initialise the switch.
    if [[ "$sonic_asic_platform" == "broadcom" && "$WARM_BOOT" != "true" ]]; then
        # Test sysfs directly rather than parsing `ls | grep`: the grep was a
        # substring match and broke when several names contained "bcm0".
        if [[ -e /sys/class/net/bcm0 ]]; then
            debug "stop SDK opennsl-modules ..."
            /etc/init.d/opennsl-modules stop
            debug "start SDK opennsl-modules ..."
            /etc/init.d/opennsl-modules start
            debug "started SDK opennsl-modules"
        fi
    fi

    # Barefoot: bounce the usb0 interface when it exists.
    if [[ "$sonic_asic_platform" == "barefoot" ]]; then
        if [[ -e /sys/class/net/usb0 ]]; then
            /usr/bin/ip link set usb0 down
            /usr/bin/ip link set usb0 up
        fi
    fi

    # Cavium: start xpnet on any non-warm boot.
    if [[ "$WARM_BOOT" != "true" && "$sonic_asic_platform" == "cavium" ]]; then
        /etc/init.d/xpnet.sh start
    fi
}
#######################################
# Start (or leave to their timers) the pmon and lldp services.
# Globals:
#   sonic_asic_platform (read)  - pmon handling applies to "mellanox" only
#   BOOT_TYPE (written)         - output of getBootType
#   PMON_TIMER_STATUS (written) - `systemctl is-active pmon.timer` result
#   lldp_state (written)        - `systemctl is-enabled lldp.timer` result
# Arguments: none
#######################################
function waitplatform() {
    BOOT_TYPE=$(getBootType)

    # fast / warm / fastfast boots defer services to systemd timers.
    local timer_driven_boot="no"
    case "$BOOT_TYPE" in
        fast | warm | fastfast)
            timer_driven_boot="yes"
            ;;
    esac

    if [[ "$sonic_asic_platform" == "mellanox" ]]; then
        if [[ "$timer_driven_boot" == "no" ]]; then
            debug "Starting pmon service..."
            /bin/systemctl start pmon
            debug "Started pmon service"
        else
            PMON_TIMER_STATUS=$(systemctl is-active pmon.timer)
            if [[ "$PMON_TIMER_STATUS" == "inactive" ]]; then
                # Timer is not armed yet: arm it so pmon starts later.
                systemctl start pmon.timer
            else
                debug "PMON service is delayed by a timer for better fast/warm boot performance"
            fi
        fi
    fi

    if [[ "$timer_driven_boot" == "no" ]]; then
        # Only start lldp directly when its timer unit is enabled.
        lldp_state=$(systemctl is-enabled lldp.timer)
        if [[ "$lldp_state" == "enabled" ]]; then
            debug "Starting lldp service..."
            /bin/systemctl start lldp
            debug "Started lldp service"
        fi
    else
        debug "LLDP service is delayed by a timer for better fast/warm boot performance"
    fi
}
#######################################
# First stage of stopping syncd: either stop pmon (mellanox cold stop) or ask
# the syncd process inside the container to shut down and wait for it.
# Globals:
#   sonic_asic_platform (read) - platform name
#   TYPE (read)                - shutdown type; passed to
#                                syncd_request_shutdown as --<TYPE>
#   DEV (read)                 - optional suffix of the container name
# Arguments: none
#######################################
function stopplatform1() {
    local container="syncd${DEV}"

    if [[ "$sonic_asic_platform" == "mellanox" && "$TYPE" == "cold" ]]; then
        debug "Stopping pmon service ahead of syncd..."
        /bin/systemctl stop pmon
        debug "Stopped pmon service"
    else
        debug "${TYPE} shutdown syncd process ..."
        /usr/bin/docker exec -i "$container" /usr/bin/syncd_request_shutdown "--${TYPE}"

        # Poll until the syncd process has left the container or the timeout
        # has elapsed. On timeout this only logs; nothing here kills syncd.
        local -r timeout_secs=20
        local started=${SECONDS}
        local elapsed=0
        while docker top "$container" | grep -q /usr/bin/syncd \
                && (( elapsed <= timeout_secs )); do
            sleep 0.1
            elapsed=$(( SECONDS - started ))
        done

        if (( elapsed > timeout_secs )); then
            debug "syncd process in container ${container} did not exit gracefully"
        fi

        # Flush filesystem buffers inside the container.
        /usr/bin/docker exec -i "$container" /bin/sync
        debug "Finished ${TYPE} shutdown syncd process ..."
    fi
}
#######################################
# Second stage of stopping syncd: platform-specific driver teardown.
# Nothing is done when WARM_BOOT is "true".
# Globals:
#   WARM_BOOT (read)           - "true" skips all teardown
#   sonic_asic_platform (read) - selects the mellanox or cavium actions
# Arguments: none
#######################################
function stopplatform2() {
    if [[ "$WARM_BOOT" == "true" ]]; then
        return 0
    fi

    # Quoted `case` instead of `[ x$var == ... ]`: the unquoted form raised
    # "too many arguments" when the value contained whitespace.
    case "$sonic_asic_platform" in
        mellanox)
            /etc/init.d/sxdkernel stop
            /usr/bin/mst stop
            ;;
        cavium)
            # xpnet is stopped and then started again right away.
            /etc/init.d/xpnet.sh stop
            /etc/init.d/xpnet.sh start
            ;;
    esac
}
# Positional arguments: the operation to run and an optional device suffix.
OP=$1
DEV=$2

SERVICE="syncd"
PEER="swss"
DEBUGLOG="/tmp/swss-syncd-debug${DEV}.log"
LOCKFILE="/tmp/swss-syncd-lock${DEV}"
NAMESPACE_PREFIX="asic"

# Without a device suffix no namespace is used; with one, sonic-db-cli is
# pointed at the matching "asic<DEV>" namespace.
NET_NS=""
SONIC_DB_CLI="sonic-db-cli"
if [[ -n "$DEV" ]]; then
    NET_NS="${NAMESPACE_PREFIX}${DEV}" #name of the network namespace
    SONIC_DB_CLI="sonic-db-cli -n ${NET_NS}"
fi

# Dispatch to the function named after the requested operation. start/wait/stop
# are not defined in this file; presumably the sourced syncd_common.sh provides
# them.
case "$OP" in
    start | wait | stop)
        "$OP"
        ;;
    *)
        echo "Usage: $0 {start|wait|stop}"
        exit 1
        ;;
esac