74e28ad6ff
- Added support for the thermal event as a Last Reboot Reason in the "show reboot-cause" command. - Added support for sending a log message in case of a thermal shutdown, e.g.: sonic NOTICE root: Shutting down due to over temperature (40 degree, 30 degree, 34 degree)
254 lines
7.4 KiB
Bash
Executable File
254 lines
7.4 KiB
Bash
Executable File
#!/bin/bash

# Run cleanup when the shell exits (0) and on signals 1, 2, 3, 6, 10, 11, 13
# and 15 (HUP, INT, QUIT, ABRT, 10 = USR1 on x86 Linux, SEGV, PIPE, TERM).
# Signal 9 (KILL) can never be trapped, so it is intentionally not listed.
trap 'cleanup' 0 1 2 3 6 10 11 13 15

# Fan-speed level most recently applied by monitor_temp_sensors (0-4).
# 99 matches no level, so the first monitoring pass always sets a speed.
LEVEL=99
# Seconds to sleep between monitoring passes.
INTERVAL=5
# Per-tray fault state used by check_faulty_fan:
#   2 = fault already reported, 0 = tray seen healthy, 1 = initial value.
FAULTY_FANTRAY1=1
FAULTY_FANTRAY2=1
FAULTY_FANTRAY3=1

# FAN RPM Speed
IDLE=7000
LEVEL1=10000
LEVEL2=13000
LEVEL3=16000
LEVEL4=19000
# NOTE(review): LEVEL5 is not referenced by the code visible in this file.
LEVEL5=19000
# File that receives the "Thermal Overload" record before a thermal shutdown.
LRR_FILE="/host/reboot-cause/reboot-cause.txt"
# sysfs directory under which the sensor and fan-controller devices are found.
I2C_ADAPTER="/sys/class/i2c-adapter/i2c-2/i2c-11"

# Current temperature readings. The hwmon* component is a glob kept literal
# here; it is resolved each time the variable is expanded unquoted.
SENSOR1="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_input"
SENSOR2="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_input"
SENSOR3="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_input"

# Maximum-temperature limits of the same sensors (same glob convention).
SENSOR1_MAX="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_max"
SENSOR2_MAX="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_max"
SENSOR3_MAX="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_max"

# Limits as found at start-up; cleanup writes these values back on exit.
# The paths are deliberately unquoted so that the hwmon* glob expands.
SENSOR1_MAX_VAL=$(cat $SENSOR1_MAX)
SENSOR2_MAX_VAL=$(cat $SENSOR2_MAX)
SENSOR3_MAX_VAL=$(cat $SENSOR3_MAX)

# Raised limits: original + 5000, reduced by 63 to differentiate these
# temperature settings from the pmon sensors configuration settings.
SENSOR1_NEW_MAX=$(( SENSOR1_MAX_VAL + 5000 - 63 ))
SENSOR2_NEW_MAX=$(( SENSOR2_MAX_VAL + 5000 - 63 ))
SENSOR3_NEW_MAX=$(( SENSOR3_MAX_VAL + 5000 - 63 ))

# Three fan trays, each containing two separate fans
# fan1-fan4 fan2-fan5 fan3-fan6
# NOTE(review): the pairing above does not obviously match the variable
# mapping below (tray 1 = fan1/fan2 of 11-0029, ...) - confirm with hardware.
# Target-RPM attributes (written by update_fan_speed).
FANTRAY1_FAN1=$I2C_ADAPTER/11-0029/fan1_target
FANTRAY1_FAN2=$I2C_ADAPTER/11-0029/fan2_target
FANTRAY2_FAN1=$I2C_ADAPTER/11-0029/fan3_target
FANTRAY2_FAN2=$I2C_ADAPTER/11-0029/fan4_target
FANTRAY3_FAN1=$I2C_ADAPTER/11-002a/fan1_target
FANTRAY3_FAN2=$I2C_ADAPTER/11-002a/fan2_target

# Measured-RPM attributes (read by check_faulty_fan).
FANTRAY1_FAN1_RPM=$I2C_ADAPTER/11-0029/fan1_input
FANTRAY1_FAN2_RPM=$I2C_ADAPTER/11-0029/fan2_input
FANTRAY2_FAN1_RPM=$I2C_ADAPTER/11-0029/fan3_input
FANTRAY2_FAN2_RPM=$I2C_ADAPTER/11-0029/fan4_input
FANTRAY3_FAN1_RPM=$I2C_ADAPTER/11-002a/fan1_input
FANTRAY3_FAN2_RPM=$I2C_ADAPTER/11-002a/fan2_input
#######################################
# Abort the script unless a kernel module is currently loaded.
# Arguments: $1 - module name exactly as printed in the first column of
#                 lsmod output
# Outputs:   "<module> is not loaded!" on stderr when the module is missing
# Returns:   0 when the module is listed; otherwise exits the shell with 1
#######################################
check_module() {
  local module=$1

  # Compare against the first column only: a plain substring search would
  # also accept longer module names or names that merely appear in the
  # "Used by" column of another module.
  if ! lsmod | awk -v name="$module" '$1 == name { found = 1 } END { exit !found }'; then
    echo "$module is not loaded!" >&2
    exit 1
  fi
}
#######################################
# Trap handler: write the saved temp1_max limits back to the sensors and
# terminate the script.
# Globals:   SENSOR{1,2,3}_MAX (read, sysfs path that may contain a glob),
#            SENSOR{1,2,3}_MAX_VAL (read, limit to restore)
# Returns:   never returns; exits the shell with status 1
#######################################
cleanup() {
  local idx path_var value_var

  # Disarm the EXIT trap so the exit below does not run this handler a
  # second time when we were entered through a signal trap.
  trap - 0

  for idx in 1 2 3; do
    path_var="SENSOR${idx}_MAX"
    value_var="SENSOR${idx}_MAX_VAL"

    # Nothing was saved for this sensor: do not write a blank line to sysfs.
    [[ -n "${!value_var}" ]] || continue

    # The redirect target is left unquoted on purpose so that the hwmon*
    # glob stored in the path variable is expanded.
    echo "${!value_var}" > ${!path_var}
  done

  exit 1
}
# Print the RPM stored in sysfs file $1; prints 0 when the file cannot be
# read or does not contain a plain non-negative integer.
_read_fan_rpm() {
  local rpm
  rpm=$(cat "$1")
  [[ "$rpm" =~ ^[0-9]+$ ]] || rpm=0
  printf '%s\n' "$rpm"
}

#######################################
# Detect fan trays with a fan spinning at 1000 RPM or less.
#
# The first time a tray is found faulty, all fans are driven to 16200 RPM
# through /usr/local/bin/set-fan-speed (per the original author this speed
# handles temperatures up to 75C) and a message is sent to logger. The
# tray's FAULTY_FANTRAYn flag is set to 2 so the fault is reported only
# once; it is reset to 0 as soon as both fans are above 1000 RPM again.
# A fan whose RPM cannot be read is treated as 0 RPM, i.e. faulty.
#
# Globals:   FANTRAY{1,2,3}_FAN{1,2}_RPM (read),
#            FAULTY_FANTRAY{1,2,3} (read/written)
# Arguments: none
#######################################
check_faulty_fan() {
  local tray state_var rpm_file rpm_a rpm_b
  local -a rpm=()

  # Sample all six fans up front, before any speed change is requested.
  for rpm_file in \
    "$FANTRAY1_FAN1_RPM" "$FANTRAY1_FAN2_RPM" \
    "$FANTRAY2_FAN1_RPM" "$FANTRAY2_FAN2_RPM" \
    "$FANTRAY3_FAN1_RPM" "$FANTRAY3_FAN2_RPM"; do
    rpm+=("$(_read_fan_rpm "$rpm_file")")
  done

  for tray in 1 2 3; do
    state_var="FAULTY_FANTRAY${tray}"
    rpm_a=${rpm[(tray - 1) * 2]}
    rpm_b=${rpm[(tray - 1) * 2 + 1]}

    if [ "$rpm_a" -le 1000 ] || [ "$rpm_b" -le 1000 ]; then
      # First time detecting failure
      if [ "${!state_var:-1}" -lt 2 ]; then
        printf -v "$state_var" '%s' 2
        # NOTE(review): meaning of the second argument (2) is defined by
        # set-fan-speed, which is not part of this file.
        /usr/local/bin/set-fan-speed 16200 2 > /dev/null
        case "$tray" in
          1) logger "Faulty Fans in Fantray1 $rpm_a $rpm_b Please check." ;;
          2) logger "Faulty Fans in FanTray2: $rpm_a $rpm_b. Please check." ;;
          3) logger "FanTray3 Fans are Faulty.. $rpm_a $rpm_b. Please check." ;;
        esac
      fi
    else
      # Both fans are above the threshold: re-arm reporting for this tray.
      printf -v "$state_var" '%s' 0
    fi
  done
}
#######################################
# Write one target RPM to all six fan target attributes.
# Globals:   FANTRAY{1,2,3}_FAN{1,2} (read, files to write)
# Arguments: $1 - target speed in RPM; must be a non-negative integer
# Outputs:   an error on stderr when the argument is not a valid RPM
# Returns:   0 when every write succeeded, 1 on an invalid argument or
#            when at least one write failed
#######################################
update_fan_speed() {
  local fan_speed=$1
  local target rc=0

  # Refuse to push an empty or malformed value to the fan controller.
  if ! [[ "$fan_speed" =~ ^[0-9]+$ ]]; then
    echo "update_fan_speed: invalid RPM '$fan_speed'" >&2
    return 1
  fi

  for target in \
    "$FANTRAY1_FAN1" "$FANTRAY1_FAN2" \
    "$FANTRAY2_FAN1" "$FANTRAY2_FAN2" \
    "$FANTRAY3_FAN1" "$FANTRAY3_FAN2"; do
    # Keep going on failure so the remaining fans are still updated.
    echo "$fan_speed" > "$target" || rc=1
  done

  return "$rc"
}
#######################################
# One monitoring pass: maintain the raised sensor limits, handle thermal
# overload, pick the fan speed for the average temperature and check the
# fans.
#
# Globals:   SENSOR{1,2,3}, SENSOR{1,2,3}_MAX (read, sysfs paths that may
#            contain a glob), SENSOR{1,2,3}_MAX_VAL and
#            SENSOR{1,2,3}_NEW_MAX (read/written), LEVEL (read/written),
#            IDLE, LEVEL1..LEVEL4, LRR_FILE (read)
# Arguments: none
# Returns:   1 when a temperature sensor could not be read (fan speed is
#            left unchanged); otherwise the status of check_faulty_fan
#######################################
monitor_temp_sensors() {
  local idx max_path_var limit_var raised_var input_var
  local current_limit reading avg level rpm
  local all_valid=1 tripped=0
  local -a millideg=() deg=()

  # Sysfs paths are expanded unquoted throughout this function on purpose:
  # they contain a hwmon* glob that must be resolved at the time of use.

  # If a limit no longer equals the raised value we wrote, something else
  # changed it: adopt that value as the new reference limit and raise it
  # again (+5000, minus 63 to tell our setting apart from pmon's).
  for idx in 1 2 3; do
    max_path_var="SENSOR${idx}_MAX"
    limit_var="SENSOR${idx}_MAX_VAL"
    raised_var="SENSOR${idx}_NEW_MAX"

    current_limit=$(cat ${!max_path_var})
    if [ "$current_limit" -ne "${!raised_var}" ]; then
      printf -v "$limit_var" '%s' "$current_limit"
      printf -v "$raised_var" '%s' "$(( current_limit + 5000 - 63 ))"
      echo "${!raised_var}" > ${!max_path_var}
    fi
  done

  # Read every sensor once. All sensors output in 1000's, so keep the raw
  # value for limit comparison and value/1000 for logging and averaging.
  for idx in 1 2 3; do
    input_var="SENSOR${idx}"
    reading=$(cat ${!input_var})
    if [[ "$reading" =~ ^-?[0-9]+$ ]]; then
      millideg+=("$reading")
      deg+=("$(( reading / 1000 ))")
    else
      millideg+=("")
      deg+=("")
      all_valid=0
    fi
  done

  # A sensor at or above its reference limit means a thermal trip.
  for idx in 1 2 3; do
    limit_var="SENSOR${idx}_MAX_VAL"
    if [[ -n "${millideg[idx - 1]}" ]] \
      && [ "${millideg[idx - 1]}" -ge "${!limit_var}" ]; then
      tripped=1
    fi
  done

  if (( tripped )); then
    # Thermal trip is about to happen
    echo "Thermal Overload ${deg[0]} ${deg[1]} ${deg[2]}" > "$LRR_FILE"
    logger "Shutting down due to over temperature (${deg[0]} degree, ${deg[1]} degree, ${deg[2]} degree)"
    sync
    sleep 1 # Give time to send logger message to server

    # Assigning the original max values back in sensors. Per the original
    # author's "Not Reached" note, the hardware is expected to shut the
    # system down at this point.
    for idx in 1 2 3; do
      max_path_var="SENSOR${idx}_MAX"
      limit_var="SENSOR${idx}_MAX_VAL"
      echo "${!limit_var}" > ${!max_path_var}
    done

    # In case if HW fails to shutdown
    /sbin/shutdown -P now
  fi

  if (( ! all_valid )); then
    # Without all three readings no average can be computed; leave the
    # fan speed untouched but still look for faulty fans.
    echo "monitor_temp_sensors: unable to read all temperature sensors" >&2
    check_faulty_fan
    return 1
  fi

  avg=$(( (deg[0] + deg[1] + deg[2]) / 3 ))

  # Map the average temperature to a level and its target RPM.
  if (( avg <= 25 )); then
    level=0 rpm=$IDLE
  elif (( avg <= 44 )); then
    level=1 rpm=$LEVEL1
  elif (( avg <= 59 )); then
    level=2 rpm=$LEVEL2
  elif (( avg <= 79 )); then
    level=3 rpm=$LEVEL3
  else
    level=4 rpm=$LEVEL4
  fi

  # Only touch the fans (and log) when the level actually changes.
  if [ "$LEVEL" -ne "$level" ]; then
    LEVEL=$level
    update_fan_speed "$rpm"
    logger "Adjusted FAN Speed to $rpm RPM against $avg Temperature"
  fi

  # Check for faulty fan
  check_faulty_fan
}
# Both drivers must be loaded before the sysfs attributes are used;
# check_module terminates the script when one is missing.
check_module "dell_s6000_platform"
check_module "max6620"

# Main loop: run one monitoring pass every INTERVAL seconds, forever.
while :; do
  monitor_temp_sensors
  # Sleep in the background and block in `wait` instead, so the script
  # keeps handling signals while it is idle.
  sleep "$INTERVAL" &
  wait
done