From ed818f8dd533804ee176cc1fac65aacdcea50a3c Mon Sep 17 00:00:00 2001 From: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:01:23 +0800 Subject: [PATCH] [PSU daemon] Support PSU power threshold checking (#288) --- sonic-psud/scripts/psud | 52 ++++++++- sonic-psud/tests/mock_platform.py | 6 + sonic-psud/tests/test_DaemonPsud.py | 167 ++++++++++++++++++++++++++-- 3 files changed, 213 insertions(+), 12 deletions(-) diff --git a/sonic-psud/scripts/psud b/sonic-psud/scripts/psud index a8c529c62910..96ba57935b74 100644 --- a/sonic-psud/scripts/psud +++ b/sonic-psud/scripts/psud @@ -57,6 +57,9 @@ PSU_INFO_VOLTAGE_MAX_TH_FIELD = 'voltage_max_threshold' PSU_INFO_VOLTAGE_MIN_TH_FIELD = 'voltage_min_threshold' PSU_INFO_CURRENT_FIELD = 'current' PSU_INFO_POWER_FIELD = 'power' +PSU_INFO_POWER_OVERLOAD = 'power_overload' +PSU_INFO_POWER_WARNING_SUPPRESS_THRESHOLD = 'power_warning_suppress_threshold' +PSU_INFO_POWER_CRITICAL_THRESHOLD = 'power_critical_threshold' PSU_INFO_FRU_FIELD = 'is_replaceable' PSU_INFO_IN_VOLTAGE_FIELD = 'input_voltage' PSU_INFO_IN_CURRENT_FIELD = 'input_current' @@ -283,6 +286,8 @@ class PsuStatus(object): self.power_good = True self.voltage_good = True self.temperature_good = True + self.check_psu_power_threshold = False + self.power_exceeded_threshold = False self.logger = logger def set_presence(self, presence): @@ -339,6 +344,13 @@ class PsuStatus(object): self.temperature_good = temperature_good return True + def set_power_exceed_threshold(self, power_exceeded_threshold): + if power_exceeded_threshold == self.power_exceeded_threshold: + return False + + self.power_exceeded_threshold = power_exceeded_threshold + return True + def is_ok(self): return self.presence and self.power_good and self.voltage_good and self.temperature_good @@ -486,6 +498,8 @@ class DaemonPsud(daemon_base.DaemonBase): 'PSU absence warning cleared: {} is inserted back.'.format(name), 'PSU absence warning: {} is not present.'.format(name) ) + if not psu_status.presence: + psu_status.check_psu_power_threshold = False if presence_changed or self.first_run: # Have to update PSU fan data here because PSU presence status changed. If we don't @@ -495,13 +509,46 @@ class DaemonPsud(daemon_base.DaemonBase): # every 60 seconds, it may still treat PSU state to "OK" and PSU LED to "red". self._update_psu_fan_data(psu, index) - if presence and psu_status.set_power_good(power_good): + power_good_changed = psu_status.set_power_good(power_good) + if presence and power_good_changed: set_led = True log_on_status_changed(self, psu_status.power_good, 'Power absence warning cleared: {} power is back to normal.'.format(name), 'Power absence warning: {} is out of power.'.format(name) ) + if presence and power_good_changed or self.first_run: + psu_status.check_psu_power_threshold = False + if psu_status.power_good: + # power_good has been updated and it is True, which means it was False + # Initialize power exceeding threshold state in this case + if (try_get(psu.get_psu_power_critical_threshold) and try_get(psu.get_psu_power_warning_suppress_threshold) and power != NOT_AVAILABLE): + psu_status.check_psu_power_threshold = True + + power_exceeded_threshold = psu_status.power_exceeded_threshold + power_warning_suppress_threshold = try_get(psu.get_psu_power_warning_suppress_threshold, NOT_AVAILABLE) + power_critical_threshold = try_get(psu.get_psu_power_critical_threshold, NOT_AVAILABLE) + if psu_status.check_psu_power_threshold: + if power_warning_suppress_threshold == NOT_AVAILABLE or power_critical_threshold == NOT_AVAILABLE: + self.log_error("PSU power thresholds become invalid: threshold {} critical threshold {}".format(power_warning_suppress_threshold, power_critical_threshold)) + psu_status.check_psu_power_threshold = False + psu_status.power_exceeded_threshold = False + elif psu_status.power_exceeded_threshold: + # The failing threshold is the warning threshold + if power < power_warning_suppress_threshold: + # Clear alarm + power_exceeded_threshold = False + else: + # The rising threshold is the critical threshold + if power >= power_critical_threshold: + # Raise alarm + power_exceeded_threshold = True + + if psu_status.set_power_exceed_threshold(power_exceeded_threshold): + log_on_status_changed(self, not psu_status.power_exceeded_threshold, + 'PSU power warning cleared: {} power {} is back to normal.'.format(name, power), + 'PSU power warning: {} power {} exceeds critical threshold {}.'.format(name, power, power_critical_threshold)) + if presence and psu_status.set_voltage(voltage, voltage_high_threshold, voltage_low_threshold): set_led = True log_on_status_changed(self, psu_status.voltage_good, @@ -532,6 +579,9 @@ class DaemonPsud(daemon_base.DaemonBase): (PSU_INFO_VOLTAGE_MAX_TH_FIELD, str(voltage_high_threshold)), (PSU_INFO_CURRENT_FIELD, str(current)), (PSU_INFO_POWER_FIELD, str(power)), + (PSU_INFO_POWER_WARNING_SUPPRESS_THRESHOLD, str(power_warning_suppress_threshold)), + (PSU_INFO_POWER_CRITICAL_THRESHOLD, str(power_critical_threshold)), + (PSU_INFO_POWER_OVERLOAD, str(power_exceeded_threshold)), (PSU_INFO_FRU_FIELD, str(is_replaceable)), (PSU_INFO_IN_CURRENT_FIELD, str(in_current)), (PSU_INFO_IN_VOLTAGE_FIELD, str(in_voltage)), diff --git a/sonic-psud/tests/mock_platform.py b/sonic-psud/tests/mock_platform.py index 2294533d619e..5db3e394ff1a 100644 --- a/sonic-psud/tests/mock_platform.py +++ b/sonic-psud/tests/mock_platform.py @@ -356,6 +356,12 @@ def set_status_led(self, color): self._status_led_color = color return True + def get_psu_power_critical_threshold(self): + raise NotImplementedError + + def get_psu_power_warning_suppress_threshold(self): + raise NotImplementedError + # Methods inherited from DeviceBase class and related setters def get_name(self): return self._name diff --git a/sonic-psud/tests/test_DaemonPsud.py b/sonic-psud/tests/test_DaemonPsud.py index f86a91231d07..482eb1cdd165 100644 --- a/sonic-psud/tests/test_DaemonPsud.py +++ b/sonic-psud/tests/test_DaemonPsud.py @@ -143,16 +143,7 @@ def test_update_psu_data(self): expected_calls = [mock.call("Failed to update PSU data - Test message")] * 2 assert daemon_psud.log_warning.mock_calls == expected_calls - @mock.patch('psud._wrapper_get_psu_presence', mock.MagicMock()) - @mock.patch('psud._wrapper_get_psu_status', mock.MagicMock()) - def test_update_single_psu_data(self): - psud._wrapper_get_psu_presence.return_value = True - psud._wrapper_get_psu_status.return_value = True - - psu1 = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234') - psud.platform_chassis = MockChassis() - psud.platform_chassis._psu_list.append(psu1) - + def _construct_expected_fvp(self, power=100.0, power_warning_suppress_threshold='N/A', power_critical_threshold='N/A', power_overload=False): expected_fvp = psud.swsscommon.FieldValuePairs( [(psud.PSU_INFO_MODEL_FIELD, 'Fake Model'), (psud.PSU_INFO_SERIAL_FIELD, '12345678'), @@ -163,17 +154,171 @@ def test_update_single_psu_data(self): (psud.PSU_INFO_VOLTAGE_MIN_TH_FIELD, '11.0'), (psud.PSU_INFO_VOLTAGE_MAX_TH_FIELD, '13.0'), (psud.PSU_INFO_CURRENT_FIELD, '8.0'), - (psud.PSU_INFO_POWER_FIELD, '100.0'), + (psud.PSU_INFO_POWER_FIELD, str(power)), + (psud.PSU_INFO_POWER_WARNING_SUPPRESS_THRESHOLD, str(power_warning_suppress_threshold)), + (psud.PSU_INFO_POWER_CRITICAL_THRESHOLD, str(power_critical_threshold)), + (psud.PSU_INFO_POWER_OVERLOAD, str(power_overload)), (psud.PSU_INFO_FRU_FIELD, 'True'), (psud.PSU_INFO_IN_VOLTAGE_FIELD, '220.25'), (psud.PSU_INFO_IN_CURRENT_FIELD, '0.72'), (psud.PSU_INFO_POWER_MAX_FIELD, 'N/A'), ]) + return expected_fvp + + @mock.patch('psud._wrapper_get_psu_presence', mock.MagicMock()) + @mock.patch('psud._wrapper_get_psu_status', mock.MagicMock()) + def test_update_single_psu_data(self): + psud._wrapper_get_psu_presence.return_value = True + psud._wrapper_get_psu_status.return_value = True + + psu1 = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234') + psud.platform_chassis = MockChassis() + psud.platform_chassis._psu_list.append(psu1) + + expected_fvp = self._construct_expected_fvp() daemon_psud = psud.DaemonPsud(SYSLOG_IDENTIFIER) daemon_psud.psu_tbl = mock.MagicMock() daemon_psud._update_single_psu_data(1, psu1) daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + assert not daemon_psud.psu_status_dict[1].check_psu_power_threshold + + @mock.patch('psud.daemon_base.db_connect', mock.MagicMock()) + def test_power_threshold(self): + psu = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234') + psud.platform_chassis = MockChassis() + psud.platform_chassis._psu_list.append(psu) + + daemon_psud = psud.DaemonPsud(SYSLOG_IDENTIFIER) + + daemon_psud.psu_tbl = mock.MagicMock() + psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=120.0) + psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(return_value=110.0) + + # Normal start. All good and all thresholds are supported + # Power is in normal range (below warning threshold) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(100.0, 110.0, 120.0, False) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + daemon_psud._update_led_color() + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + daemon_psud.first_run = False + + # Power is increasing across the warning threshold + # Normal => (warning, critical) + psu.set_power(115.0) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(115.0, 110.0, 120.0, False) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + daemon_psud._update_led_color() + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # Power is increasing across the critical threshold. Alarm raised + # (warning, critical) => (critical, ) + psu.set_power(125.0) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(125.0, 110.0, 120.0, True) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + daemon_psud._update_led_color() + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # Power is decreasing across the critical threshold. Alarm not cleared + # (critical, ) => (warning, critical) + psu.set_power(115.0) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(115.0, 110.0, 120.0, True) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + daemon_psud._update_led_color() + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # Power is decreasing across the warning threshold. Alarm cleared + # (warning, critical) => Normal + psu.set_power(105.0) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(105.0, 110.0, 120.0, False) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + daemon_psud._update_led_color() + + # Power is increasing across the critical threshold. Alarm raised + # Normal => (critical, ) + psu.set_power(125.0) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(125.0, 110.0, 120.0, True) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + daemon_psud._update_led_color() + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # Power is increasing across the critical threshold. Alarm raised + # (critical, ) => Normal + psu.set_power(105.0) + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + expected_fvp = self._construct_expected_fvp(105.0, 110.0, 120.0, False) + daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp) + daemon_psud._update_led_color() + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # PSU power becomes down + psu.set_status(False) + daemon_psud._update_single_psu_data(1, psu) + daemon_psud._update_led_color() + assert not daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + assert psu.STATUS_LED_COLOR_RED == psu.get_status_led() + + # PSU power becomes up + psu.set_status(True) + daemon_psud._update_single_psu_data(1, psu) + daemon_psud._update_led_color() + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # PSU becomes absent + psu.set_presence(False) + daemon_psud._update_single_psu_data(1, psu) + daemon_psud._update_led_color() + assert not daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + assert psu.STATUS_LED_COLOR_RED == psu.get_status_led() + + # PSU becomes present + psu.set_presence(True) + daemon_psud._update_single_psu_data(1, psu) + daemon_psud._update_led_color() + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led() + + # Thresholds become invalid on the fly + psu.get_psu_power_critical_threshold = mock.MagicMock(side_effect=NotImplementedError('')) + daemon_psud._update_single_psu_data(1, psu) + assert not daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=120.0) + daemon_psud.psu_status_dict[1].check_psu_power_threshold = True + daemon_psud._update_single_psu_data(1, psu) + assert daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold + psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(side_effect=NotImplementedError('')) + daemon_psud._update_single_psu_data(1, psu) + assert not daemon_psud.psu_status_dict[1].check_psu_power_threshold + assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold def test_set_psu_led(self): mock_logger = mock.MagicMock()