DellEMC S6100 SSD Monitor Porting Changes in master (#7289)
Why I did it: To monitor the SSD health condition on the DellEMC S6100 platform after the upgrade. A daemon is introduced to monitor the SSD once every hour, and the SSD status is also checked at boot time and at the time of a cold reboot. All of these changes are supported only on the newer SSD firmware. These changes are ported from the 201911 branch. Added a platform_reboot_pre_check script to prevent a cold reboot based on the SSD status. Depends on Azure/sonic-utilities#1556. DO NOT MERGE UNTIL THE ABOVE PR IS MERGED.
This commit is contained in:
parent
d53c6248e4
commit
803aa389b0
@ -10,20 +10,28 @@ s6100/scripts/platform_reboot_override usr/share/sonic/device/x86_64-dell_s6100_
|
||||
s6100/scripts/fast-reboot_plugin usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/track_reboot_reason.sh usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/warm-reboot_plugin usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/soft-reboot_plugin usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/reboot_plugin usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/ssd-fw-upgrade usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/override.conf /etc/systemd/system/systemd-reboot.service.d
|
||||
common/dell_lpc_mon.sh usr/local/bin
|
||||
s6100/scripts/s6100_ssd_mon.sh usr/local/bin
|
||||
s6100/scripts/s6100_ssd_upgrade_status.sh usr/local/bin
|
||||
common/actions.sh usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/platform_sensors.py usr/local/bin
|
||||
s6100/scripts/platform_reboot_pre_check usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/hw-management-generate-dump.sh usr/bin
|
||||
s6100/modules/sonic_platform-1.0-py2-none-any.whl usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/modules/sonic_platform-1.0-py3-none-any.whl usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
|
||||
s6100/scripts/platform_watchdog_enable.sh usr/local/bin
|
||||
s6100/scripts/platform_watchdog_disable.sh usr/local/bin
|
||||
s6100/scripts/sensors usr/bin
|
||||
s6100/scripts/iSMART_64 usr/local/bin
|
||||
s6100/systemd/platform-modules-s6100.service etc/systemd/system
|
||||
s6100/systemd/s6100-lpc-monitor.service etc/systemd/system
|
||||
s6100/systemd/s6100-ssd-monitor.service etc/systemd/system
|
||||
s6100/systemd/s6100-ssd-monitor.timer etc/systemd/system
|
||||
s6100/systemd/s6100-ssd-upgrade-status.service etc/systemd/system
|
||||
s6100/systemd/s6100-reboot-cause.service etc/systemd/system
|
||||
s6100/systemd/s6100-i2c-enumerate.service etc/systemd/system
|
||||
s6100/scripts/s6100_serial_getty_monitor etc/monit/conf.d
|
||||
|
BIN
platform/broadcom/sonic-platform-modules-dell/s6100/scripts/iSMART_64
Executable file
BIN
platform/broadcom/sonic-platform-modules-dell/s6100/scripts/iSMART_64
Executable file
Binary file not shown.
@ -0,0 +1,38 @@
|
||||
#!/bin/bash
#
# platform_reboot_pre_check - decide whether a cold reboot of the S6100 may
# proceed, based on the SSD marker files kept under /host/ssd_fw_upgrade and
# on a live iSMART GPIO reading.
#
# Exit status:
#   0 - GPIO7_high marker present and iSMART reports GPIO status 0x01
#   1 - every other case (pending upgrade, faulty/error marker, bad or
#       unreadable GPIO reading, or no GPIO7_* marker at all)

SSD_FW_UPGRADE="/host/ssd_fw_upgrade"

# Print the "do not cold-reboot" warning on stdout and mirror both lines to
# syslog at user.crit under the DELL_S6100_SSD_MON tag.
#   $1 - short description of the SSD condition, inserted into the message
_error_msg(){
    local warning="The SSD on this unit is $1. Do not power-cycle/reboot this unit."
    local hint="soft-/fast-/warm-reboot is allowed."

    echo "$warning"
    echo "$hint"
    logger -p user.crit -t DELL_S6100_SSD_MON "$warning"
    logger -p user.crit -t DELL_S6100_SSD_MON "$hint"
}

# Check SSD Status
if [[ -e "$SSD_FW_UPGRADE/GPIO7_pending_upgrade" ]]; then
    _error_msg "running older firmware"
    exit 1
fi

if [[ -e "$SSD_FW_UPGRADE/GPIO7_low" || -e "$SSD_FW_UPGRADE/GPIO7_error" ]]; then
    _error_msg "faulty"
    exit 1
fi

if [[ -e "$SSD_FW_UPGRADE/GPIO7_high" ]]; then
    iSMART="/usr/local/bin/iSMART_64"
    iSMART_OPTIONS="-d /dev/sda"

    # iSMART_OPTIONS is expanded unquoted on purpose so that it splits into
    # the "-d" flag and the device path.
    iSMART_CMD=$($iSMART $iSMART_OPTIONS)

    # Last field of the line mentioning GPIO in the iSMART report.
    GPIO_STATUS=$(echo "$iSMART_CMD" | grep GPIO | awk '{print $NF}')

    # Quoted comparison: an empty reading (iSMART failed or printed no GPIO
    # line) simply compares unequal and is reported as faulty, instead of
    # tripping a "[: unary operator expected" error first.
    if [[ "$GPIO_STATUS" == "0x01" ]]; then
        exit 0
    fi

    _error_msg "faulty"
    exit 1
fi

# No GPIO7_* marker file exists: refuse as well (no message is printed).
exit 1
|
@ -43,6 +43,8 @@ if [[ "$1" == "init" ]]; then
|
||||
/usr/local/bin/platform_watchdog_disable.sh
|
||||
fi
|
||||
|
||||
systemctl start --no-block s6100-ssd-upgrade-status.service
|
||||
|
||||
is_fast_warm=$(cat /proc/cmdline | grep SONIC_BOOT_TYPE | wc -l)
|
||||
|
||||
if [[ "$is_fast_warm" == "1" ]]; then
|
||||
|
21
platform/broadcom/sonic-platform-modules-dell/s6100/scripts/s6100_ssd_mon.sh
Executable file
21
platform/broadcom/sonic-platform-modules-dell/s6100/scripts/s6100_ssd_mon.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash
#
# s6100_ssd_mon.sh - one polling pass of the Dell S6100 SSD health check
# (started by s6100-ssd-monitor.service).
#
# While the GPIO7_high marker exists under /host/ssd_fw_upgrade, the GPIO
# status reported by iSMART is read. A reading other than 0x01 is recorded as
# a faulty SSD: the GPIO7_* markers are replaced by GPIO7_low, the condition
# is logged at user.crit and the polling timer is stopped. When GPIO7_high is
# absent the timer is stopped straight away.

SSD_FW_UPGRADE="/host/ssd_fw_upgrade"

if [[ -e "$SSD_FW_UPGRADE/GPIO7_high" ]]; then
    iSMART="/usr/local/bin/iSMART_64"
    iSMART_OPTIONS="-d /dev/sda"

    # iSMART_OPTIONS is expanded unquoted on purpose so that it splits into
    # the "-d" flag and the device path.
    iSMART_CMD=$($iSMART $iSMART_OPTIONS)
    # Last field of the line mentioning GPIO in the iSMART report.
    GPIO_STATUS=$(echo "$iSMART_CMD" | grep GPIO | awk '{print $NF}')

    if [[ -z "$GPIO_STATUS" ]]; then
        # No GPIO line came back. Previously this made the unquoted "[" test
        # abort with a syntax error; now it is reported explicitly and the
        # markers and timer are left untouched so the next poll tries again.
        logger -p user.warning -t DELL_S6100_SSD_MON "Unable to read the SSD GPIO status from iSMART; will retry on the next poll."
    elif [[ "$GPIO_STATUS" != "0x01" ]]; then
        logger -p user.crit -t DELL_S6100_SSD_MON "The SSD on this unit is faulty. Do not power-cycle/reboot this unit!"
        logger -p user.crit -t DELL_S6100_SSD_MON "soft-/fast-/warm-reboot is allowed."
        # ${VAR:?} aborts instead of expanding to "/GPIO7_*" if the variable
        # were ever empty.
        rm -rf -- "${SSD_FW_UPGRADE:?}"/GPIO7_*
        touch "$SSD_FW_UPGRADE/GPIO7_low"
        systemctl stop s6100-ssd-monitor.timer
    fi
else
    systemctl stop s6100-ssd-monitor.timer
fi
|
@ -0,0 +1,118 @@
|
||||
#!/bin/bash
#
# s6100_ssd_upgrade_status.sh - work out the result of the SSD firmware
# upgrade after a reboot and record it as a single GPIO7_* marker file under
# /host/ssd_fw_upgrade (high, low, error or pending_upgrade).
#
# Inputs read: `smartctl -a /dev/sda` (firmware version, device model), the
# iSMART report (GPIO status) and two status values fetched through
# io_rd_wr.py. Every decision is appended to upgrade.log in the same
# directory, followed by the raw register values and tool output.

SSD_FW_UPGRADE="/host/ssd_fw_upgrade"

# Replace any existing GPIO7_* marker with the single marker GPIO7_<$1>.
_set_marker() {
    # ${VAR:?} aborts instead of expanding to "/GPIO7_*" if the variable were
    # ever empty.
    rm -rf -- "${SSD_FW_UPGRADE:?}"/GPIO7_*
    touch "$SSD_FW_UPGRADE/GPIO7_$1"
}

# Send the two-line "SSD is faulty" notice to syslog at user.crit.
_log_faulty() {
    logger -p user.crit -t DELL_S6100_SSD_MON "The SSD on this unit is faulty. Do not power-cycle/reboot this unit!"
    logger -p user.crit -t DELL_S6100_SSD_MON "soft-/fast-/warm-reboot is allowed."
}

# Append "<script> <date> <$1>" to the upgrade log.
_upgrade_log() {
    echo "$0 $(date) $1" >> "$SSD_UPGRADE_LOG"
}

# Write 06 to offset 210 and <$1> to offset 211, read offset 212, and print
# the last field of each line the three io_rd_wr.py calls produced.
_read_upgrade_status() {
    {
        io_rd_wr.py --set --val 06 --offset 210
        io_rd_wr.py --set --val "$1" --offset 211
        io_rd_wr.py --get --offset 212
    } | awk '{print $NF}'
}

# Already classified as healthy: just (re)start the periodic monitor.
if [[ -e "$SSD_FW_UPGRADE/GPIO7_high" ]]; then
    systemctl start --no-block s6100-ssd-monitor.timer
    exit 0
fi

# Already classified as faulty / failed: nothing more to do.
if [[ -e "$SSD_FW_UPGRADE/GPIO7_low" || -e "$SSD_FW_UPGRADE/GPIO7_error" ]]; then
    exit 0
fi

mkdir -p "$SSD_FW_UPGRADE"

SSD_UPGRADE_LOG="$SSD_FW_UPGRADE/upgrade.log"

SMART_CMD=$(smartctl -a /dev/sda)

SSD_FW_VERSION=$(echo "$SMART_CMD" | grep "Firmware Version" | awk '{print $NF}')
# Lower-cased so the comparisons below are case-insensitive.
SSD_FW_VERSION=${SSD_FW_VERSION,,}
SSD_MODEL=$(echo "$SMART_CMD" | grep "Device Model" | awk '{print $NF}')

if [[ -e "$SSD_FW_UPGRADE/GPIO7_pending_upgrade" ]]; then
    if [[ "$SSD_MODEL" == "3IE" && "$SSD_FW_VERSION" == "s141002c" ]]; then
        # If SSD Firmware is not upgraded
        exit 0
    fi
    if [[ "$SSD_FW_VERSION" == "s16425c1" || "$SSD_FW_VERSION" == "s16425cq" ]]; then
        # If SSD Firmware is not upgraded
        exit 0
    fi
fi

_upgrade_log "SSD FW upgrade logs post reboot."

iSMART="/usr/local/bin/iSMART_64"
iSMART_OPTIONS="-d /dev/sda"
# iSMART_OPTIONS is expanded unquoted on purpose so that it splits into the
# "-d" flag and the device path.
iSMART_CMD=$($iSMART $iSMART_OPTIONS)

# Status after the first (selector 09) and second (selector 0A) mp_64 run,
# as named by the log messages below.
SSD_UPGRADE_STATUS1=$(_read_upgrade_status 09)
SSD_UPGRADE_STATUS2=$(_read_upgrade_status 0A)

if [[ "$SSD_UPGRADE_STATUS1" == "2" ]]; then
    _set_marker error
    _upgrade_log "Upgraded to unknown version after first mp_64 upgrade."

elif [[ "$SSD_MODEL" == "3IE3" && "$SSD_UPGRADE_STATUS2" == "2" ]]; then
    _set_marker error
    _upgrade_log "Upgraded to unknown version after second mp_64 upgrade."

elif [[ "$SSD_FW_VERSION" == "s210506g" || "$SSD_FW_VERSION" == "s16425cg" ]]; then
    # If SSD Firmware is upgraded
    GPIO_STATUS=$(echo "$iSMART_CMD" | grep GPIO | awk '{print $NF}')

    # An empty reading is not classified as faulty here; it falls through to
    # the register checks below, exactly as before, but without the
    # "[: unary operator expected" error the unquoted test used to raise.
    if [[ -n "$GPIO_STATUS" && "$GPIO_STATUS" != "0x01" ]]; then
        _log_faulty
        _set_marker low
        _upgrade_log "The SSD on this unit is faulty. Do not power-cycle/reboot this unit!"
        _upgrade_log "soft-/fast-/warm-reboot is allowed."

    elif [[ "$SSD_UPGRADE_STATUS1" == "0" ]]; then
        _set_marker high
        systemctl start --no-block s6100-ssd-monitor.timer

        if [[ "$SSD_MODEL" == "3IE" ]]; then
            _upgrade_log "SSD FW upgraded from S141002C to S210506G in first mp_64."
        else
            _upgrade_log "SSD FW upgraded from S16425c1 to S16425cG in first mp_64."
        fi

    elif [[ "$SSD_MODEL" == "3IE3" && "$SSD_UPGRADE_STATUS2" == "1" ]]; then
        _set_marker low
        _log_faulty
        _upgrade_log "SSD entered loader mode in first mp_64 and upgraded to latest version after second mp_64."
    fi

else
    if [[ "$SSD_UPGRADE_STATUS1" == "ff" && "$SSD_UPGRADE_STATUS2" == "ff" ]]; then
        _set_marker pending_upgrade
        _upgrade_log "SSD upgrade didn’t happen."

    elif [[ "$SSD_UPGRADE_STATUS1" == "1" ]]; then
        _set_marker low
        _log_faulty
        _upgrade_log "SSD entered loader mode in first mp_64 upgrade."

        if [[ "$SSD_MODEL" == "3IE3" && "$SSD_UPGRADE_STATUS2" == "0" ]]; then
            _upgrade_log "SSD entered loader mode in first mp_64 and recovered back to older version in second mp_64."
        fi
    fi
fi

# Always keep the raw evidence next to the decision.
_upgrade_log "SMF Register 1 = $SSD_UPGRADE_STATUS1"
_upgrade_log "SMF Register 2 = $SSD_UPGRADE_STATUS2"
echo "$SMART_CMD" >> "$SSD_UPGRADE_LOG"
echo "$iSMART_CMD" >> "$SSD_UPGRADE_LOG"
sync
|
@ -0,0 +1 @@
|
||||
fast-reboot_plugin
|
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Dell S6100 SSD monitoring poller
|
||||
DefaultDependencies=no
|
||||
|
||||
[Service]
|
||||
User=root
|
||||
ExecStart=/usr/local/bin/s6100_ssd_mon.sh
|
||||
RemainAfterExit=no
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Dell S6100 SSD monitoring poller timer
|
||||
DefaultDependencies=no
|
||||
After=pmon.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec=60min
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
@ -0,0 +1,14 @@
|
||||
[Unit]
|
||||
Description= Checking Dell S6100 SSD upgrade status
|
||||
After=pmon.service
|
||||
DefaultDependencies=no
|
||||
|
||||
[Service]
|
||||
User=root
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/s6100_ssd_upgrade_status.sh
|
||||
RemainAfterExit=no
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
Loading…
Reference in New Issue
Block a user