Skip to content

Commit

Permalink
[PSU power threshold] Fix logic error: compare the system power with …
Browse files Browse the repository at this point in the history
…the PSU's power threshold (#367)

* Adjust the logic to check PSU power threshold

Check the total system power (the sum over all PSUs), rather than an individual PSU's power, against a single PSU's threshold

Signed-off-by: Stephen Sun <stephens@nvidia.com>

* Log only once instead of logging per PSU

Signed-off-by: Stephen Sun <stephens@nvidia.com>

---------

Signed-off-by: Stephen Sun <stephens@nvidia.com>
  • Loading branch information
stephenxs authored Jul 7, 2023
1 parent 81048cd commit 66f981d
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 14 deletions.
24 changes: 19 additions & 5 deletions sonic-psud/scripts/psud
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ class DaemonPsud(daemon_base.DaemonBase):
self.psu_tbl = None
self.psu_chassis_info = None
self.first_run = True
self.psu_threshold_exceeded_logged = False

global platform_psuutil
global platform_chassis
Expand Down Expand Up @@ -458,6 +459,7 @@ class DaemonPsud(daemon_base.DaemonBase):
if not platform_chassis:
return

self.psu_threshold_exceeded_logged = False
for index, psu in enumerate(platform_chassis.get_all_psus()):
try:
self._update_single_psu_data(index + 1, psu)
Expand Down Expand Up @@ -535,25 +537,37 @@ class DaemonPsud(daemon_base.DaemonBase):
power_warning_suppress_threshold = try_get(psu.get_psu_power_warning_suppress_threshold, NOT_AVAILABLE)
power_critical_threshold = try_get(psu.get_psu_power_critical_threshold, NOT_AVAILABLE)
if psu_status.check_psu_power_threshold:
# Calculate total power
system_power = float(power)
for _, other_psu in enumerate(platform_chassis.get_all_psus()):
if other_psu is psu:
# Skip the current PSU
continue
power_str = try_get(other_psu.get_power, NOT_AVAILABLE)
if power_str != NOT_AVAILABLE:
system_power += float(power_str)

if power_warning_suppress_threshold == NOT_AVAILABLE or power_critical_threshold == NOT_AVAILABLE:
self.log_error("PSU power thresholds become invalid: threshold {} critical threshold {}".format(power_warning_suppress_threshold, power_critical_threshold))
psu_status.check_psu_power_threshold = False
psu_status.power_exceeded_threshold = False
elif psu_status.power_exceeded_threshold:
# The falling threshold is the warning suppress threshold
if power < power_warning_suppress_threshold:
if system_power < power_warning_suppress_threshold:
# Clear alarm
power_exceeded_threshold = False
else:
# The rising threshold is the critical threshold
if power >= power_critical_threshold:
if system_power >= power_critical_threshold:
# Raise alarm
power_exceeded_threshold = True

if psu_status.set_power_exceed_threshold(power_exceeded_threshold):
if psu_status.set_power_exceed_threshold(power_exceeded_threshold) and not self.psu_threshold_exceeded_logged:
# Since this is a system level PSU power exceeding check, we do not need to log it for each PSU
log_on_status_changed(self, not psu_status.power_exceeded_threshold,
'PSU power warning cleared: {} power {} is back to normal.'.format(name, power),
'PSU power warning: {} power {} exceeds critical threshold {}.'.format(name, power, power_critical_threshold))
'PSU power warning cleared: system power {} is back to normal, below the warning suppress threshold {}.'.format(system_power, power_warning_suppress_threshold),
'PSU power warning: system power {} exceeds the critical threshold {}.'.format(system_power, power_critical_threshold))
self.psu_threshold_exceeded_logged = True

if presence and psu_status.set_voltage(voltage, voltage_high_threshold, voltage_low_threshold):
set_led = True
Expand Down
21 changes: 12 additions & 9 deletions sonic-psud/tests/test_DaemonPsud.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,19 +188,22 @@ def test_power_threshold(self):
psu = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234')
psud.platform_chassis = MockChassis()
psud.platform_chassis._psu_list.append(psu)
another_psu = MockPsu('PSU 2', 0, True, 'Fake Model', '12345678', '1234')
another_psu.set_power(10.0)
psud.platform_chassis._psu_list.append(another_psu)

daemon_psud = psud.DaemonPsud(SYSLOG_IDENTIFIER)

daemon_psud.psu_tbl = mock.MagicMock()
psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=120.0)
psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(return_value=110.0)
psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=130.0)
psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(return_value=120.0)

# Normal start. All good and all thresholds are supported
# Power is in normal range (below warning threshold)
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(100.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(100.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -213,7 +216,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(115.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(115.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -224,7 +227,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(125.0, 110.0, 120.0, True)
expected_fvp = self._construct_expected_fvp(125.0, 120.0, 130.0, True)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -235,7 +238,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(115.0, 110.0, 120.0, True)
expected_fvp = self._construct_expected_fvp(115.0, 120.0, 130.0, True)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -246,7 +249,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(105.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(105.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
daemon_psud._update_led_color()
Expand All @@ -257,7 +260,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(125.0, 110.0, 120.0, True)
expected_fvp = self._construct_expected_fvp(125.0, 120.0, 130.0, True)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -268,7 +271,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(105.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(105.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand Down

0 comments on commit 66f981d

Please sign in to comment.