From 74e28ad6ffd7f4e981e88dcc110506e83040bceb Mon Sep 17 00:00:00 2001 From: Santhosh Kumar T <53558409+santhosh-kt@users.noreply.github.com> Date: Thu, 6 Feb 2020 18:53:27 +0530 Subject: [PATCH] [DellEMC] S6000 - Thermal support - Last Reboot Reason (#4097) - Added support for Thermal event in Last Reboot Reason "show reboot-cause" command. - Added support for sending log message in case of thermal shutdown. sonic NOTICE root: Shutting down due to over temperature (40 degree, 30 degree, 34 degree) --- .../s6000/scripts/fancontrol.sh | 144 +++++++++++++----- .../s6000/sonic_platform/chassis.py | 34 +++++ 2 files changed, 138 insertions(+), 40 deletions(-) diff --git a/platform/broadcom/sonic-platform-modules-dell/s6000/scripts/fancontrol.sh b/platform/broadcom/sonic-platform-modules-dell/s6000/scripts/fancontrol.sh index 43315e06c9..665d5494b6 100755 --- a/platform/broadcom/sonic-platform-modules-dell/s6000/scripts/fancontrol.sh +++ b/platform/broadcom/sonic-platform-modules-dell/s6000/scripts/fancontrol.sh @@ -1,4 +1,5 @@ #!/bin/bash +trap 'cleanup' 0 1 2 3 6 9 10 11 13 15 LEVEL=99 INTERVAL=5 @@ -14,11 +15,27 @@ LEVEL3=16000 LEVEL4=19000 LEVEL5=19000 +LRR_FILE="/host/reboot-cause/reboot-cause.txt" I2C_ADAPTER="/sys/class/i2c-adapter/i2c-2/i2c-11" + SENSOR1="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_input" SENSOR2="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_input" SENSOR3="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_input" +SENSOR1_MAX="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_max" +SENSOR2_MAX="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_max" +SENSOR3_MAX="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_max" + +SENSOR1_MAX_VAL=$(cat $SENSOR1_MAX) +SENSOR2_MAX_VAL=$(cat $SENSOR2_MAX) +SENSOR3_MAX_VAL=$(cat $SENSOR3_MAX) + +# Reducing by 63 to differentiate these temperature settings +# from the pmon sensors configuration settings +SENSOR1_NEW_MAX=$(expr `echo $SENSOR1_MAX_VAL` + 5000 - 63) +SENSOR2_NEW_MAX=$(expr `echo $SENSOR2_MAX_VAL` + 5000 - 63) +SENSOR3_NEW_MAX=$(expr `echo 
$SENSOR3_MAX_VAL` + 5000 - 63) + # Three fan trays with each contains two separate fans # fan1-fan4 fan2-fan5 fan3-fan6 FANTRAY1_FAN1=$I2C_ADAPTER/11-0029/fan1_target @@ -46,6 +63,14 @@ function check_module fi } +function cleanup +{ + echo $SENSOR1_MAX_VAL > $SENSOR1_MAX + echo $SENSOR2_MAX_VAL > $SENSOR2_MAX + echo $SENSOR3_MAX_VAL > $SENSOR3_MAX + exit 1 +} + function check_faulty_fan { @@ -123,56 +148,95 @@ function update_fan_speed function monitor_temp_sensors { + SENSOR1_CUR_MAX_VAL=$(cat $SENSOR1_MAX) + SENSOR2_CUR_MAX_VAL=$(cat $SENSOR2_MAX) + SENSOR3_CUR_MAX_VAL=$(cat $SENSOR3_MAX) + if [ "$SENSOR1_CUR_MAX_VAL" -ne "$SENSOR1_NEW_MAX" ] + then + SENSOR1_NEW_MAX=$(expr `echo $SENSOR1_CUR_MAX_VAL` + 5000 - 63) + SENSOR1_MAX_VAL=$SENSOR1_CUR_MAX_VAL + echo $SENSOR1_NEW_MAX > $SENSOR1_MAX + fi + if [ "$SENSOR2_CUR_MAX_VAL" -ne "$SENSOR2_NEW_MAX" ] + then + SENSOR2_NEW_MAX=$(expr `echo $SENSOR2_CUR_MAX_VAL` + 5000 - 63) + SENSOR2_MAX_VAL=$SENSOR2_CUR_MAX_VAL + echo $SENSOR2_NEW_MAX > $SENSOR2_MAX + fi + if [ "$SENSOR3_CUR_MAX_VAL" -ne "$SENSOR3_NEW_MAX" ] + then + SENSOR3_NEW_MAX=$(expr `echo $SENSOR3_CUR_MAX_VAL` + 5000 - 63) + SENSOR3_MAX_VAL=$SENSOR3_CUR_MAX_VAL + echo $SENSOR3_NEW_MAX > $SENSOR3_MAX + fi - while true # go through all temp sensor outputs - do - sensor1=$(expr `echo $(cat $SENSOR1)` / 1000) - sensor2=$(expr `echo $(cat $SENSOR2)` / 1000) - sensor3=$(expr `echo $(cat $SENSOR3)` / 1000) - sum=$(($sensor1 + $sensor2 + $sensor3)) - sensor_temp=$(($sum/3)) + # go through all temp sensor outputs + sensor1=$(expr `echo $(cat $SENSOR1)` / 1000) + sensor2=$(expr `echo $(cat $SENSOR2)` / 1000) + sensor3=$(expr `echo $(cat $SENSOR3)` / 1000) + # All sensors output in 1000's + s1=$(cat $SENSOR1) + s2=$(cat $SENSOR2) + s3=$(cat $SENSOR3) - if [ "$sensor_temp" -le "25" ] && [ "$LEVEL" -ne "0" ] - then - # Set Fan Speed to 7000 RPM" - LEVEL=0 - update_fan_speed $IDLE - logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + if [ "$s1" 
-ge "$SENSOR1_MAX_VAL" ] || [ "$s2" -ge "$SENSOR2_MAX_VAL" ] || [ "$s3" -ge "$SENSOR3_MAX_VAL" ] + then + # Thermal trip is about to happen + echo "Thermal Overload $sensor1 $sensor2 $sensor3" > $LRR_FILE + logger "Shutting down due to over temperature ($sensor1 degree, $sensor2 degree, $sensor3 degree)" + sync + sleep 1 # Give time to send logger message to server + # Restoring the original max values to the sensors + echo $SENSOR1_MAX_VAL > $SENSOR1_MAX + echo $SENSOR2_MAX_VAL > $SENSOR2_MAX + echo $SENSOR3_MAX_VAL > $SENSOR3_MAX - elif [ "$sensor_temp" -ge "26" ] && [ "$sensor_temp" -le "44" ] && [ "$LEVEL" -ne "1" ] - then - # Set Fan Speed to 10000 RPM" - LEVEL=1 - update_fan_speed $LEVEL1 - logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + ### Not Reached ### + # In case the HW fails to shut down + /sbin/shutdown -P now + fi + sum=$(($sensor1 + $sensor2 + $sensor3)) + sensor_temp=$(($sum/3)) - elif [ "$sensor_temp" -ge "45" ] && [ "$sensor_temp" -le "59" ] && [ "$LEVEL" -ne "2" ] - then - # Set Fan Speed to 13000 RPM" - LEVEL=2 - update_fan_speed $LEVEL2 - logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + if [ "$sensor_temp" -le "25" ] && [ "$LEVEL" -ne "0" ] + then + # Set Fan Speed to 7000 RPM" + LEVEL=0 + update_fan_speed $IDLE + logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" - elif [ "$sensor_temp" -ge "60" ] && [ "$sensor_temp" -le "79" ] && [ "$LEVEL" -ne "3" ] - then - # Set Fan Speed to 16000 RPM" - LEVEL=3 - update_fan_speed $LEVEL3 - logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + elif [ "$sensor_temp" -ge "26" ] && [ "$sensor_temp" -le "44" ] && [ "$LEVEL" -ne "1" ] + then + # Set Fan Speed to 10000 RPM" + LEVEL=1 + update_fan_speed $LEVEL1 + logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" - elif [ "$sensor_temp" -ge "80" ] && [ "$LEVEL" -ne "4" ] - then - # Set Fan Speed to 19000 RPM" - LEVEL=4 - update_fan_speed 
$LEVEL4 - logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" - fi + elif [ "$sensor_temp" -ge "45" ] && [ "$sensor_temp" -le "59" ] && [ "$LEVEL" -ne "2" ] + then + # Set Fan Speed to 13000 RPM" + LEVEL=2 + update_fan_speed $LEVEL2 + logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + + elif [ "$sensor_temp" -ge "60" ] && [ "$sensor_temp" -le "79" ] && [ "$LEVEL" -ne "3" ] + then + # Set Fan Speed to 16000 RPM" + LEVEL=3 + update_fan_speed $LEVEL3 + logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + + elif [ "$sensor_temp" -ge "80" ] && [ "$LEVEL" -ne "4" ] + then + # Set Fan Speed to 19000 RPM" + LEVEL=4 + update_fan_speed $LEVEL4 + logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature" + fi # Check for faulty fan check_faulty_fan - done - } # Check drivers for sysfs attributes diff --git a/platform/broadcom/sonic-platform-modules-dell/s6000/sonic_platform/chassis.py b/platform/broadcom/sonic-platform-modules-dell/s6000/sonic_platform/chassis.py index e94a7d1210..8bb95cac1d 100755 --- a/platform/broadcom/sonic-platform-modules-dell/s6000/sonic_platform/chassis.py +++ b/platform/broadcom/sonic-platform-modules-dell/s6000/sonic_platform/chassis.py @@ -11,6 +11,7 @@ try: import os import time import datetime + import struct import subprocess from sonic_platform_base.chassis_base import ChassisBase from sonic_platform.sfp import Sfp @@ -41,6 +42,7 @@ class Chassis(ChassisBase): reset_reason_dict = {} reset_reason_dict[0xe] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE reset_reason_dict[0x6] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE + reset_reason_dict[0x7] = ChassisBase.REBOOT_CAUSE_THERMAL_OVERLOAD_OTHER def __init__(self): ChassisBase.__init__(self) @@ -100,6 +102,36 @@ class Chassis(ChassisBase): rv = rv.lstrip(" ") return rv + def _nvram_write(self, offset, val): + resource = "/dev/nvram" + fd = os.open(resource, os.O_RDWR) + if (fd < 0): + print('File open failed ',resource) + 
return + if (os.lseek(fd, offset, os.SEEK_SET) != offset): + print('lseek failed on ',resource) + return + ret = os.write(fd, struct.pack('B', val)) + if ret != 1: + print('Write failed ',str(ret)) + return + os.close(fd) + + def _get_thermal_reset(self): + reset_file = "/host/reboot-cause/reboot-cause.txt" + if (not os.path.isfile(reset_file)): + return False + try: + with open(reset_file, 'r') as fd: + rv = fd.read() + except Exception as error: + return False + + if "Thermal Overload" in rv: + return True + + return False + def get_name(self): """ Retrieves the name of the chassis @@ -181,6 +213,8 @@ class Chassis(ChassisBase): # NVRAM. Only Warmboot and Coldboot reason are supported here. # Since it does not support any hardware reason, we return # non_hardware as default + if self._get_thermal_reset() == True: + self._nvram_write(0x49, 0x7) lrr = self._get_cpld_register('last_reboot_reason') if (lrr != 'ERR'):