Fix system-health hardware_checker to consume fan tolerance details (#16689)
Why I did it

Fan tolerance checking is now done through the new APIs is_under_speed and is_over_speed, which populate the corresponding fields in the database. speed_tolerance is no longer used and was removed, but system-health was not updated and therefore reports failures:

ADO: 25279165

root@sonic/# show system-health summary
System status summary
  System status LED  red_blink
  Services:
    Status: OK
  Hardware:
    Status: Not OK
    Reasons: Failed to get speed tolerance for fantray5.fan1
             Failed to get speed tolerance for fantray5.fan0
             Failed to get speed tolerance for fantray4.fan1
             Failed to get speed tolerance for fantray4.fan0
             Failed to get speed tolerance for fantray3.fan1
             Failed to get speed tolerance for fantray3.fan0
             Failed to get speed tolerance for fantray2.fan1
             Failed to get speed tolerance for fantray2.fan0
             Failed to get speed tolerance for fantray1.fan1
             Failed to get speed tolerance for fantray1.fan0
             Failed to get speed tolerance for fantray0.fan1
             Failed to get speed tolerance for fantray0.fan0
             Failed to get speed tolerance for PSU1.fan0
             Failed to get speed tolerance for PSU0.fan0

How I did it

Updated hardware_checker.py in system-health to consume the new is_under_speed and is_over_speed database entries instead of speed_tolerance and the hard-coded threshold calculations.

How to verify it

root@sonic:/# show system-health summary
System status summary
  System status LED  green
  Services:
    Status: OK
  Hardware:
    Status: OK
This commit is contained in:
parent
93eaa3cac0
commit
3b982c073c
@ -102,37 +102,39 @@ class HardwareChecker(HealthChecker):
|
||||
if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
|
||||
speed = data_dict.get('speed', None)
|
||||
speed_target = data_dict.get('speed_target', None)
|
||||
speed_tolerance = data_dict.get('speed_tolerance', None)
|
||||
is_under_speed = data_dict.get('is_under_speed', None)
|
||||
is_over_speed = data_dict.get('is_over_speed', None)
|
||||
if not speed:
|
||||
self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
|
||||
continue
|
||||
elif not speed_target:
|
||||
self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
|
||||
continue
|
||||
elif not speed_tolerance:
|
||||
self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
|
||||
elif is_under_speed is None:
|
||||
self.set_object_not_ok('Fan', name, 'Failed to get under speed threshold check for {}'.format(name))
|
||||
continue
|
||||
elif is_over_speed is None:
|
||||
self.set_object_not_ok('Fan', name, 'Failed to get over speed threshold check for {}'.format(name))
|
||||
continue
|
||||
else:
|
||||
try:
|
||||
speed = float(speed)
|
||||
speed_target = float(speed_target)
|
||||
speed_tolerance = float(speed_tolerance)
|
||||
speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
|
||||
speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
|
||||
if speed < speed_min_th or speed > speed_max_th:
|
||||
if 'true' in (is_under_speed.lower(), is_over_speed.lower()):
|
||||
self.set_object_not_ok('Fan', name,
|
||||
'{} speed is out of range, speed={}, range=[{},{}]'.format(name,
|
||||
speed,
|
||||
speed_min_th,
|
||||
speed_max_th))
|
||||
'{} speed is out of range, speed={}, target={}'.format(
|
||||
name,
|
||||
speed,
|
||||
speed_target))
|
||||
continue
|
||||
except ValueError:
|
||||
self.set_object_not_ok('Fan', name,
|
||||
'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
|
||||
'Invalid fan speed data for {}, speed={}, target={}, is_under_speed={}, is_over_speed={}'.format(
|
||||
name,
|
||||
speed,
|
||||
speed_target,
|
||||
speed_tolerance))
|
||||
is_under_speed,
|
||||
is_over_speed))
|
||||
continue
|
||||
|
||||
if not self._ignore_check(config.ignore_devices, 'fan', name, 'direction'):
|
||||
|
@ -298,7 +298,8 @@ def test_hardware_checker():
|
||||
'status': 'True',
|
||||
'speed': '60',
|
||||
'speed_target': '60',
|
||||
'speed_tolerance': '20',
|
||||
'is_under_speed': 'False',
|
||||
'is_over_speed': 'False',
|
||||
'direction': 'intake'
|
||||
},
|
||||
'FAN_INFO|fan2': {
|
||||
@ -306,28 +307,40 @@ def test_hardware_checker():
|
||||
'status': 'True',
|
||||
'speed': '60',
|
||||
'speed_target': '60',
|
||||
'speed_tolerance': '20'
|
||||
'is_under_speed': 'False',
|
||||
'is_over_speed': 'False',
|
||||
},
|
||||
'FAN_INFO|fan3': {
|
||||
'presence': 'True',
|
||||
'status': 'False',
|
||||
'speed': '60',
|
||||
'speed_target': '60',
|
||||
'speed_tolerance': '20'
|
||||
'is_under_speed': 'False',
|
||||
'is_over_speed': 'False',
|
||||
},
|
||||
'FAN_INFO|fan4': {
|
||||
'presence': 'True',
|
||||
'status': 'True',
|
||||
'speed': '20',
|
||||
'speed_target': '60',
|
||||
'speed_tolerance': '20'
|
||||
'is_under_speed': 'True',
|
||||
'is_over_speed': 'False',
|
||||
},
|
||||
'FAN_INFO|fan5': {
|
||||
'presence': 'True',
|
||||
'status': 'True',
|
||||
'speed': '90',
|
||||
'speed_target': '60',
|
||||
'is_under_speed': 'False',
|
||||
'is_over_speed': 'True',
|
||||
},
|
||||
'FAN_INFO|fan6': {
|
||||
'presence': 'True',
|
||||
'status': 'True',
|
||||
'speed': '60',
|
||||
'speed_target': '60',
|
||||
'speed_tolerance': '20',
|
||||
'is_under_speed': 'False',
|
||||
'is_over_speed': 'False',
|
||||
'direction': 'exhaust'
|
||||
}
|
||||
})
|
||||
@ -426,7 +439,10 @@ def test_hardware_checker():
|
||||
|
||||
assert 'fan5' in checker._info
|
||||
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
|
||||
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan5 direction exhaust is not aligned with fan1 direction intake'
|
||||
|
||||
assert 'fan6' in checker._info
|
||||
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
|
||||
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan6 direction exhaust is not aligned with fan1 direction intake'
|
||||
|
||||
assert 'PSU 1' in checker._info
|
||||
assert checker._info['PSU 1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
|
||||
|
Loading…
Reference in New Issue
Block a user