Fix system-health hardware_checker to consume fan tolerance details (#16689)

Why I did it

Fan tolerance checking is done through new APIs, is_under_speed and is_over_speed, which populate corresponding fields into the database. speed_tolerance is no longer used and was removed, but system-health was not updated and indicates failures:

ADO: 25279165

root@sonic/# show system-health summary
System status summary

  System status LED  red_blink
  Services:
    Status: OK
  Hardware:
    Status: Not OK
    Reasons: Failed to get speed tolerance for fantray5.fan1
	     Failed to get speed tolerance for fantray5.fan0
	     Failed to get speed tolerance for fantray4.fan1
	     Failed to get speed tolerance for fantray4.fan0
	     Failed to get speed tolerance for fantray3.fan1
	     Failed to get speed tolerance for fantray3.fan0
	     Failed to get speed tolerance for fantray2.fan1
	     Failed to get speed tolerance for fantray2.fan0
	     Failed to get speed tolerance for fantray1.fan1
	     Failed to get speed tolerance for fantray1.fan0
	     Failed to get speed tolerance for fantray0.fan1
	     Failed to get speed tolerance for fantray0.fan0
	     Failed to get speed tolerance for PSU1.fan0
	     Failed to get speed tolerance for PSU0.fan0

How I did it
Updated hardware_checker.py in system-health to consume new is_under_speed and is_over_speed database entries instead of speed_tolerance and hard-coded calculations.

How to verify it
root@sonic:/# show system-health summary
System status summary

  System status LED  green
  Services:
    Status: OK
  Hardware:
    Status: OK
This commit is contained in:
spilkey-cisco 2023-12-15 15:33:20 -08:00 committed by GitHub
parent 979516633d
commit 69ad1ed41a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 19 deletions

View File

@ -102,37 +102,39 @@ class HardwareChecker(HealthChecker):
if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
speed = data_dict.get('speed', None)
speed_target = data_dict.get('speed_target', None)
speed_tolerance = data_dict.get('speed_tolerance', None)
is_under_speed = data_dict.get('is_under_speed', None)
is_over_speed = data_dict.get('is_over_speed', None)
if not speed:
self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
continue
elif not speed_target:
self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
continue
elif not speed_tolerance:
self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
elif is_under_speed is None:
self.set_object_not_ok('Fan', name, 'Failed to get under speed threshold check for {}'.format(name))
continue
elif is_over_speed is None:
self.set_object_not_ok('Fan', name, 'Failed to get over speed threshold check for {}'.format(name))
continue
else:
try:
speed = float(speed)
speed_target = float(speed_target)
speed_tolerance = float(speed_tolerance)
speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
if speed < speed_min_th or speed > speed_max_th:
if 'true' in (is_under_speed.lower(), is_over_speed.lower()):
self.set_object_not_ok('Fan', name,
'{} speed is out of range, speed={}, range=[{},{}]'.format(name,
speed,
speed_min_th,
speed_max_th))
'{} speed is out of range, speed={}, target={}'.format(
name,
speed,
speed_target))
continue
except ValueError:
self.set_object_not_ok('Fan', name,
'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
'Invalid fan speed data for {}, speed={}, target={}, is_under_speed={}, is_over_speed={}'.format(
name,
speed,
speed_target,
speed_tolerance))
is_under_speed,
is_over_speed))
continue
if not self._ignore_check(config.ignore_devices, 'fan', name, 'direction'):

View File

@ -298,7 +298,8 @@ def test_hardware_checker():
'status': 'True',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20',
'is_under_speed': 'False',
'is_over_speed': 'False',
'direction': 'intake'
},
'FAN_INFO|fan2': {
@ -306,28 +307,40 @@ def test_hardware_checker():
'status': 'True',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20'
'is_under_speed': 'False',
'is_over_speed': 'False',
},
'FAN_INFO|fan3': {
'presence': 'True',
'status': 'False',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20'
'is_under_speed': 'False',
'is_over_speed': 'False',
},
'FAN_INFO|fan4': {
'presence': 'True',
'status': 'True',
'speed': '20',
'speed_target': '60',
'speed_tolerance': '20'
'is_under_speed': 'True',
'is_over_speed': 'False',
},
'FAN_INFO|fan5': {
'presence': 'True',
'status': 'True',
'speed': '90',
'speed_target': '60',
'is_under_speed': 'False',
'is_over_speed': 'True',
},
'FAN_INFO|fan6': {
'presence': 'True',
'status': 'True',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20',
'is_under_speed': 'False',
'is_over_speed': 'False',
'direction': 'exhaust'
}
})
@ -426,7 +439,10 @@ def test_hardware_checker():
assert 'fan5' in checker._info
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan5 direction exhaust is not aligned with fan1 direction intake'
assert 'fan6' in checker._info
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan6 direction exhaust is not aligned with fan1 direction intake'
assert 'PSU 1' in checker._info
assert checker._info['PSU 1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK