[DellEMC] S6000 - Thermal support - Last Reboot Reason (#4097)
- Added support for reporting a thermal event as the last reboot reason in the "show reboot-cause" command. - Added a log message that is sent when a thermal shutdown occurs, for example: sonic NOTICE root: Shutting down due to over temperature (40 degree, 30 degree, 34 degree)
This commit is contained in:
parent
2e0f9ca0f4
commit
74e28ad6ff
@ -1,4 +1,5 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
trap 'cleanup' 0 1 2 3 6 9 10 11 13 15
|
||||||
|
|
||||||
LEVEL=99
|
LEVEL=99
|
||||||
INTERVAL=5
|
INTERVAL=5
|
||||||
@ -14,11 +15,27 @@ LEVEL3=16000
|
|||||||
LEVEL4=19000
|
LEVEL4=19000
|
||||||
LEVEL5=19000
|
LEVEL5=19000
|
||||||
|
|
||||||
|
LRR_FILE="/host/reboot-cause/reboot-cause.txt"
|
||||||
I2C_ADAPTER="/sys/class/i2c-adapter/i2c-2/i2c-11"
|
I2C_ADAPTER="/sys/class/i2c-adapter/i2c-2/i2c-11"
|
||||||
|
|
||||||
SENSOR1="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_input"
|
SENSOR1="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_input"
|
||||||
SENSOR2="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_input"
|
SENSOR2="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_input"
|
||||||
SENSOR3="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_input"
|
SENSOR3="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_input"
|
||||||
|
|
||||||
|
SENSOR1_MAX="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_max"
|
||||||
|
SENSOR2_MAX="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_max"
|
||||||
|
SENSOR3_MAX="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_max"
|
||||||
|
|
||||||
|
SENSOR1_MAX_VAL=$(cat $SENSOR1_MAX)
|
||||||
|
SENSOR2_MAX_VAL=$(cat $SENSOR2_MAX)
|
||||||
|
SENSOR3_MAX_VAL=$(cat $SENSOR3_MAX)
|
||||||
|
|
||||||
|
# Reducing by 63 to differentiate these temperature settings
|
||||||
|
# from pmon sensors configuration settings
|
||||||
|
SENSOR1_NEW_MAX=$(expr `echo $SENSOR1_MAX_VAL` + 5000 - 63)
|
||||||
|
SENSOR2_NEW_MAX=$(expr `echo $SENSOR2_MAX_VAL` + 5000 - 63)
|
||||||
|
SENSOR3_NEW_MAX=$(expr `echo $SENSOR3_MAX_VAL` + 5000 - 63)
|
||||||
|
|
||||||
# Three fan trays with each contains two separate fans
|
# Three fan trays with each contains two separate fans
|
||||||
# fan1-fan4 fan2-fan5 fan3-fan6
|
# fan1-fan4 fan2-fan5 fan3-fan6
|
||||||
FANTRAY1_FAN1=$I2C_ADAPTER/11-0029/fan1_target
|
FANTRAY1_FAN1=$I2C_ADAPTER/11-0029/fan1_target
|
||||||
@ -46,6 +63,14 @@ function check_module
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function cleanup
{
    # Trap handler: write the saved SENSORn_MAX_VAL values back into the
    # three temp1_max sysfs attributes, then terminate the script with
    # exit status 1.
    local idx saved_val max_path
    for idx in 1 2 3
    do
        # Indirect expansion picks SENSOR<idx>_MAX_VAL / SENSOR<idx>_MAX.
        saved_val="SENSOR${idx}_MAX_VAL"
        max_path="SENSOR${idx}_MAX"
        # The path is left unquoted on purpose: it contains a "hwmon*"
        # glob that the shell has to expand.
        echo ${!saved_val} > ${!max_path}
    done
    exit 1
}
|
||||||
|
|
||||||
function check_faulty_fan
|
function check_faulty_fan
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -123,56 +148,95 @@ function update_fan_speed
|
|||||||
|
|
||||||
function monitor_temp_sensors
{
    # One monitoring pass:
    #   1. Re-sync the raised temp1_max limits if something else changed them.
    #   2. Read the three temperature sensors.
    #   3. If any sensor reached its saved max value, record "Thermal
    #      Overload" in $LRR_FILE, log it, restore the saved limits and
    #      power the box off.
    #   4. Otherwise pick a fan speed level from the average temperature
    #      and check for faulty fans.
    # Reads/updates the globals SENSORn_MAX_VAL, SENSORn_NEW_MAX and LEVEL.

    SENSOR1_CUR_MAX_VAL=$(cat $SENSOR1_MAX)
    SENSOR2_CUR_MAX_VAL=$(cat $SENSOR2_MAX)
    SENSOR3_CUR_MAX_VAL=$(cat $SENSOR3_MAX)

    # If the limit currently in sysfs is not the one this script wrote,
    # treat the current value as the new saved limit and write it back
    # raised by 5000 - 63.
    if [ "$SENSOR1_CUR_MAX_VAL" -ne "$SENSOR1_NEW_MAX" ]
    then
        SENSOR1_NEW_MAX=$(expr $SENSOR1_CUR_MAX_VAL + 5000 - 63)
        SENSOR1_MAX_VAL=$SENSOR1_CUR_MAX_VAL
        echo $SENSOR1_NEW_MAX > $SENSOR1_MAX
    fi
    if [ "$SENSOR2_CUR_MAX_VAL" -ne "$SENSOR2_NEW_MAX" ]
    then
        SENSOR2_NEW_MAX=$(expr $SENSOR2_CUR_MAX_VAL + 5000 - 63)
        SENSOR2_MAX_VAL=$SENSOR2_CUR_MAX_VAL
        echo $SENSOR2_NEW_MAX > $SENSOR2_MAX
    fi
    if [ "$SENSOR3_CUR_MAX_VAL" -ne "$SENSOR3_NEW_MAX" ]
    then
        SENSOR3_NEW_MAX=$(expr $SENSOR3_CUR_MAX_VAL + 5000 - 63)
        SENSOR3_MAX_VAL=$SENSOR3_CUR_MAX_VAL
        echo $SENSOR3_NEW_MAX > $SENSOR3_MAX
    fi

    # go through all temp sensor outputs
    # All sensors output in 1000's; read each sensor once so the raw
    # value and the value divided by 1000 come from the same sample.
    s1=$(cat $SENSOR1)
    s2=$(cat $SENSOR2)
    s3=$(cat $SENSOR3)
    sensor1=$(expr $s1 / 1000)
    sensor2=$(expr $s2 / 1000)
    sensor3=$(expr $s3 / 1000)

    if [ "$s1" -ge "$SENSOR1_MAX_VAL" ] || [ "$s2" -ge "$SENSOR2_MAX_VAL" ] || [ "$s3" -ge "$SENSOR3_MAX_VAL" ]
    then
        # Thermal trip is about to happen
        echo "Thermal Overload $sensor1 $sensor2 $sensor3" > $LRR_FILE
        logger "Shutting down due to over temperature ($sensor1 degree, $sensor2 degree, $sensor3 degree)"
        sync
        sleep 1 # Give time to send logger message to server
        # Assigning the original max values back in sensors
        echo $SENSOR1_MAX_VAL > $SENSOR1_MAX
        echo $SENSOR2_MAX_VAL > $SENSOR2_MAX
        echo $SENSOR3_MAX_VAL > $SENSOR3_MAX

        ### Not Reached ###
        # In case if HW fails to shutdown
        /sbin/shutdown -P now
    fi

    sum=$(($sensor1 + $sensor2 + $sensor3))
    sensor_temp=$(($sum/3))

    # Each branch only fires when LEVEL changes, so the fan speed is
    # rewritten (and logged) once per transition, not on every pass.
    # The log line reports the speed that was actually applied.
    if [ "$sensor_temp" -le "25" ] && [ "$LEVEL" -ne "0" ]
    then
        # Set Fan Speed to 7000 RPM"
        LEVEL=0
        update_fan_speed $IDLE
        logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "26" ] && [ "$sensor_temp" -le "44" ] && [ "$LEVEL" -ne "1" ]
    then
        # Set Fan Speed to 10000 RPM"
        LEVEL=1
        update_fan_speed $LEVEL1
        logger "Adjusted FAN Speed to $LEVEL1 RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "45" ] && [ "$sensor_temp" -le "59" ] && [ "$LEVEL" -ne "2" ]
    then
        # Set Fan Speed to 13000 RPM"
        LEVEL=2
        update_fan_speed $LEVEL2
        logger "Adjusted FAN Speed to $LEVEL2 RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "60" ] && [ "$sensor_temp" -le "79" ] && [ "$LEVEL" -ne "3" ]
    then
        # Set Fan Speed to 16000 RPM"
        LEVEL=3
        update_fan_speed $LEVEL3
        logger "Adjusted FAN Speed to $LEVEL3 RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "80" ] && [ "$LEVEL" -ne "4" ]
    then
        # Set Fan Speed to 19000 RPM"
        LEVEL=4
        update_fan_speed $LEVEL4
        logger "Adjusted FAN Speed to $LEVEL4 RPM against $sensor_temp Temperature"
    fi

    # Check for faulty fan
    check_faulty_fan

}
|
||||||
|
|
||||||
# Check drivers for sysfs attributes
|
# Check drivers for sysfs attributes
|
||||||
|
@ -11,6 +11,7 @@ try:
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import datetime
|
import datetime
|
||||||
|
import struct
|
||||||
import subprocess
|
import subprocess
|
||||||
from sonic_platform_base.chassis_base import ChassisBase
|
from sonic_platform_base.chassis_base import ChassisBase
|
||||||
from sonic_platform.sfp import Sfp
|
from sonic_platform.sfp import Sfp
|
||||||
@ -41,6 +42,7 @@ class Chassis(ChassisBase):
|
|||||||
reset_reason_dict = {}
|
reset_reason_dict = {}
|
||||||
reset_reason_dict[0xe] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE
|
reset_reason_dict[0xe] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE
|
||||||
reset_reason_dict[0x6] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE
|
reset_reason_dict[0x6] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE
|
||||||
|
reset_reason_dict[0x7] = ChassisBase.REBOOT_CAUSE_THERMAL_OVERLOAD_OTHER
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
ChassisBase.__init__(self)
|
ChassisBase.__init__(self)
|
||||||
@ -100,6 +102,36 @@ class Chassis(ChassisBase):
|
|||||||
rv = rv.lstrip(" ")
|
rv = rv.lstrip(" ")
|
||||||
return rv
|
return rv
|
||||||
|
|
||||||
|
def _nvram_write(self, offset, val):
    """Write a single byte to /dev/nvram at the given offset.

    Args:
        offset: Byte offset within /dev/nvram to seek to.
        val: Integer value packed as one unsigned byte (struct 'B').

    Returns:
        None. Open, seek-mismatch and short-write failures are reported
        with print() and the method returns without raising.
    """
    resource = "/dev/nvram"
    # os.open() signals failure by raising OSError; it never returns a
    # negative descriptor, so the failure has to be caught here.
    try:
        fd = os.open(resource, os.O_RDWR)
    except OSError:
        print('File open failed ', resource)
        return

    # The finally clause closes the descriptor on every exit path,
    # including the early returns below.
    try:
        if os.lseek(fd, offset, os.SEEK_SET) != offset:
            print('lseek failed on ', resource)
            return
        ret = os.write(fd, struct.pack('B', val))
        if ret != 1:
            print('Write failed ', str(ret))
            return
    finally:
        os.close(fd)
|
||||||
|
|
||||||
|
def _get_thermal_reset(self):
    """Report whether the reboot-cause file records a thermal overload.

    Returns:
        True if /host/reboot-cause/reboot-cause.txt is a regular file
        whose contents include "Thermal Overload"; False if the file is
        missing, cannot be read, or does not contain that text.
    """
    cause_path = "/host/reboot-cause/reboot-cause.txt"
    if os.path.isfile(cause_path):
        try:
            with open(cause_path, 'r') as cause_file:
                contents = cause_file.read()
        except Exception:
            # Any read failure is treated as "no thermal reset recorded".
            return False
        return "Thermal Overload" in contents
    return False
|
||||||
|
|
||||||
def get_name(self):
|
def get_name(self):
|
||||||
"""
|
"""
|
||||||
Retrieves the name of the chassis
|
Retrieves the name of the chassis
|
||||||
@ -181,6 +213,8 @@ class Chassis(ChassisBase):
|
|||||||
# NVRAM. Only Warmboot and Coldboot reason are supported here.
|
# NVRAM. Only Warmboot and Coldboot reason are supported here.
|
||||||
# Since it does not support any hardware reason, we return
|
# Since it does not support any hardware reason, we return
|
||||||
# non_hardware as default
|
# non_hardware as default
|
||||||
|
if self._get_thermal_reset() == True:
|
||||||
|
self._nvram_write(0x49, 0x7)
|
||||||
|
|
||||||
lrr = self._get_cpld_register('last_reboot_reason')
|
lrr = self._get_cpld_register('last_reboot_reason')
|
||||||
if (lrr != 'ERR'):
|
if (lrr != 'ERR'):
|
||||||
|
Loading…
Reference in New Issue
Block a user