Skip to content

Commit

Permalink
Fix system-health hardware_checker to consume fan tolerance details (s…
Browse files Browse the repository at this point in the history
…onic-net#16689)

Why I did it

Fan tolerance checking is now done through the new APIs is_under_speed and is_over_speed, which populate the corresponding fields in the database. The speed_tolerance field is no longer used and was removed, but system-health was not updated to match, so it still looks for speed_tolerance and reports failures such as:

ADO: 25279165

root@sonic/# show system-health summary
System status summary

  System status LED  red_blink
  Services:
    Status: OK
  Hardware:
    Status: Not OK
    Reasons: Failed to get speed tolerance for fantray5.fan1
	     Failed to get speed tolerance for fantray5.fan0
	     Failed to get speed tolerance for fantray4.fan1
	     Failed to get speed tolerance for fantray4.fan0
	     Failed to get speed tolerance for fantray3.fan1
	     Failed to get speed tolerance for fantray3.fan0
	     Failed to get speed tolerance for fantray2.fan1
	     Failed to get speed tolerance for fantray2.fan0
	     Failed to get speed tolerance for fantray1.fan1
	     Failed to get speed tolerance for fantray1.fan0
	     Failed to get speed tolerance for fantray0.fan1
	     Failed to get speed tolerance for fantray0.fan0
	     Failed to get speed tolerance for PSU1.fan0
	     Failed to get speed tolerance for PSU0.fan0

How I did it
Updated hardware_checker.py in system-health to consume the new is_under_speed and is_over_speed database entries instead of reading speed_tolerance and computing the min/max speed thresholds itself.

How to verify it
root@sonic:/# show system-health summary
System status summary

  System status LED  green
  Services:
    Status: OK
  Hardware:
    Status: OK
  • Loading branch information
spilkey-cisco authored and mssonicbld committed Feb 2, 2024
1 parent e676ad1 commit 79d9cb4
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 19 deletions.
28 changes: 15 additions & 13 deletions src/system-health/health_checker/hardware_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,37 +102,39 @@ def _check_fan_status(self, config):
if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
speed = data_dict.get('speed', None)
speed_target = data_dict.get('speed_target', None)
speed_tolerance = data_dict.get('speed_tolerance', None)
is_under_speed = data_dict.get('is_under_speed', None)
is_over_speed = data_dict.get('is_over_speed', None)
if not speed:
self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
continue
elif not speed_target:
self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
continue
elif not speed_tolerance:
self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
elif is_under_speed is None:
self.set_object_not_ok('Fan', name, 'Failed to get under speed threshold check for {}'.format(name))
continue
elif is_over_speed is None:
self.set_object_not_ok('Fan', name, 'Failed to get over speed threshold check for {}'.format(name))
continue
else:
try:
speed = float(speed)
speed_target = float(speed_target)
speed_tolerance = float(speed_tolerance)
speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
if speed < speed_min_th or speed > speed_max_th:
if 'true' in (is_under_speed.lower(), is_over_speed.lower()):
self.set_object_not_ok('Fan', name,
'{} speed is out of range, speed={}, range=[{},{}]'.format(name,
speed,
speed_min_th,
speed_max_th))
'{} speed is out of range, speed={}, target={}'.format(
name,
speed,
speed_target))
continue
except ValueError:
self.set_object_not_ok('Fan', name,
'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
'Invalid fan speed data for {}, speed={}, target={}, is_under_speed={}, is_over_speed={}'.format(
name,
speed,
speed_target,
speed_tolerance))
is_under_speed,
is_over_speed))
continue

if not self._ignore_check(config.ignore_devices, 'fan', name, 'direction'):
Expand Down
28 changes: 22 additions & 6 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,36 +298,49 @@ def test_hardware_checker():
'status': 'True',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20',
'is_under_speed': 'False',
'is_over_speed': 'False',
'direction': 'intake'
},
'FAN_INFO|fan2': {
'presence': 'False',
'status': 'True',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20'
'is_under_speed': 'False',
'is_over_speed': 'False',
},
'FAN_INFO|fan3': {
'presence': 'True',
'status': 'False',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20'
'is_under_speed': 'False',
'is_over_speed': 'False',
},
'FAN_INFO|fan4': {
'presence': 'True',
'status': 'True',
'speed': '20',
'speed_target': '60',
'speed_tolerance': '20'
'is_under_speed': 'True',
'is_over_speed': 'False',
},
'FAN_INFO|fan5': {
'presence': 'True',
'status': 'True',
'speed': '90',
'speed_target': '60',
'is_under_speed': 'False',
'is_over_speed': 'True',
},
'FAN_INFO|fan6': {
'presence': 'True',
'status': 'True',
'speed': '60',
'speed_target': '60',
'speed_tolerance': '20',
'is_under_speed': 'False',
'is_over_speed': 'False',
'direction': 'exhaust'
}
})
Expand Down Expand Up @@ -426,7 +439,10 @@ def test_hardware_checker():

assert 'fan5' in checker._info
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan5 direction exhaust is not aligned with fan1 direction intake'

assert 'fan6' in checker._info
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan6 direction exhaust is not aligned with fan1 direction intake'

assert 'PSU 1' in checker._info
assert checker._info['PSU 1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
Expand Down

0 comments on commit 79d9cb4

Please sign in to comment.