From 79d9cb44df488b12743f129bbbc3bb885fa0f9e4 Mon Sep 17 00:00:00 2001 From: spilkey-cisco <110940806+spilkey-cisco@users.noreply.github.com> Date: Fri, 15 Dec 2023 15:33:20 -0800 Subject: [PATCH] Fix system-health hardware_checker to consume fan tolerance details (#16689) Why I did it Fan tolerance checking is done through new APIs, is_under_speed and is_over_speed, which populate corresponding fields into the database. speed_tolerance is no longer used and was removed, but system-health was not updated and indicates failures: ADO: 25279165 root@sonic:/# show system-health summary System status summary System status LED red_blink Services: Status: OK Hardware: Status: Not OK Reasons: Failed to get speed tolerance for fantray5.fan1 Failed to get speed tolerance for fantray5.fan0 Failed to get speed tolerance for fantray4.fan1 Failed to get speed tolerance for fantray4.fan0 Failed to get speed tolerance for fantray3.fan1 Failed to get speed tolerance for fantray3.fan0 Failed to get speed tolerance for fantray2.fan1 Failed to get speed tolerance for fantray2.fan0 Failed to get speed tolerance for fantray1.fan1 Failed to get speed tolerance for fantray1.fan0 Failed to get speed tolerance for fantray0.fan1 Failed to get speed tolerance for fantray0.fan0 Failed to get speed tolerance for PSU1.fan0 Failed to get speed tolerance for PSU0.fan0 How I did it Updated hardware_checker.py in system-health to consume new is_under_speed and is_over_speed database entries instead of speed_tolerance and hard-coded calculations. 
How to verify it root@sonic:/# show system-health summary System status summary System status LED green Services: Status: OK Hardware: Status: OK --- .../health_checker/hardware_checker.py | 28 ++++++++++--------- src/system-health/tests/test_system_health.py | 28 +++++++++++++++---- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/system-health/health_checker/hardware_checker.py b/src/system-health/health_checker/hardware_checker.py index 8f7a11f55c2e..113fd88663a9 100644 --- a/src/system-health/health_checker/hardware_checker.py +++ b/src/system-health/health_checker/hardware_checker.py @@ -102,37 +102,39 @@ def _check_fan_status(self, config): if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'): speed = data_dict.get('speed', None) speed_target = data_dict.get('speed_target', None) - speed_tolerance = data_dict.get('speed_tolerance', None) + is_under_speed = data_dict.get('is_under_speed', None) + is_over_speed = data_dict.get('is_over_speed', None) if not speed: self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name)) continue elif not speed_target: self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name)) continue - elif not speed_tolerance: - self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name)) + elif is_under_speed is None: + self.set_object_not_ok('Fan', name, 'Failed to get under speed threshold check for {}'.format(name)) + continue + elif is_over_speed is None: + self.set_object_not_ok('Fan', name, 'Failed to get over speed threshold check for {}'.format(name)) continue else: try: speed = float(speed) speed_target = float(speed_target) - speed_tolerance = float(speed_tolerance) - speed_min_th = speed_target * (1 - float(speed_tolerance) / 100) - speed_max_th = speed_target * (1 + float(speed_tolerance) / 100) - if speed < speed_min_th or speed > speed_max_th: + if 'true' in (is_under_speed.lower(), 
is_over_speed.lower()): self.set_object_not_ok('Fan', name, - '{} speed is out of range, speed={}, range=[{},{}]'.format(name, - speed, - speed_min_th, - speed_max_th)) + '{} speed is out of range, speed={}, target={}'.format( + name, + speed, + speed_target)) continue except ValueError: self.set_object_not_ok('Fan', name, - 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format( + 'Invalid fan speed data for {}, speed={}, target={}, is_under_speed={}, is_over_speed={}'.format( name, speed, speed_target, - speed_tolerance)) + is_under_speed, + is_over_speed)) continue if not self._ignore_check(config.ignore_devices, 'fan', name, 'direction'): diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index c2d782230749..67f819ecc5ff 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -298,7 +298,8 @@ def test_hardware_checker(): 'status': 'True', 'speed': '60', 'speed_target': '60', - 'speed_tolerance': '20', + 'is_under_speed': 'False', + 'is_over_speed': 'False', 'direction': 'intake' }, 'FAN_INFO|fan2': { @@ -306,28 +307,40 @@ def test_hardware_checker(): 'status': 'True', 'speed': '60', 'speed_target': '60', - 'speed_tolerance': '20' + 'is_under_speed': 'False', + 'is_over_speed': 'False', }, 'FAN_INFO|fan3': { 'presence': 'True', 'status': 'False', 'speed': '60', 'speed_target': '60', - 'speed_tolerance': '20' + 'is_under_speed': 'False', + 'is_over_speed': 'False', }, 'FAN_INFO|fan4': { 'presence': 'True', 'status': 'True', 'speed': '20', 'speed_target': '60', - 'speed_tolerance': '20' + 'is_under_speed': 'True', + 'is_over_speed': 'False', }, 'FAN_INFO|fan5': { + 'presence': 'True', + 'status': 'True', + 'speed': '90', + 'speed_target': '60', + 'is_under_speed': 'False', + 'is_over_speed': 'True', + }, + 'FAN_INFO|fan6': { 'presence': 'True', 'status': 'True', 'speed': '60', 'speed_target': '60', - 'speed_tolerance': '20', + 
'is_under_speed': 'False', + 'is_over_speed': 'False', 'direction': 'exhaust' } }) @@ -426,7 +439,10 @@ def test_hardware_checker(): assert 'fan5' in checker._info assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK - assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan5 direction exhaust is not aligned with fan1 direction intake' + + assert 'fan6' in checker._info + assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan6 direction exhaust is not aligned with fan1 direction intake' assert 'PSU 1' in checker._info assert checker._info['PSU 1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK