Skip to content

Commit

Permalink
[thermal fix] 1. Catch exception for each update iteration; 2. add un…
Browse files Browse the repository at this point in the history
…it test (sonic-net#51)
  • Loading branch information
Junchao-Mellanox authored Mar 17, 2020
1 parent fc455a7 commit 97e40ce
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 32 deletions.
82 changes: 51 additions & 31 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def try_get(callback, default=NOT_AVAILABLE):
"""
try:
ret = callback()
if ret is None:
ret = default
except NotImplementedError:
ret = default

Expand Down Expand Up @@ -174,16 +176,19 @@ class FanUpdater(object):
:return:
"""
logger.log_debug("Start fan updating")
try:
for index, fan in enumerate(self.chassis.get_all_fans()):
for index, fan in enumerate(self.chassis.get_all_fans()):
try:
self._refresh_fan_status(fan, index)
except Exception as e:
logger.log_warning('Failed to update FAN status - {}'.format(e))

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
for fan_index, fan in enumerate(psu.get_all_fans()):
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
for fan_index, fan in enumerate(psu.get_all_fans()):
try:
self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name))
except Exception as e:
logger.log_warning('Failed to update FAN status - {}'.format(e))
except Exception as e:
logger.log_warning('Failed to update PSU FAN status - {}'.format(e))

logger.log_debug("End fan updating")

Expand All @@ -201,10 +206,18 @@ class FanUpdater(object):

fan_status = self.fan_status_dict[fan_name]

speed = NOT_AVAILABLE
speed_tolerance = NOT_AVAILABLE
speed_target = NOT_AVAILABLE
fan_fault_status = NOT_AVAILABLE
fan_direction = NOT_AVAILABLE
presence = try_get(fan.get_presence, False)
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)
speed_target = try_get(fan.get_target_speed)
if presence:
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)
speed_target = try_get(fan.get_target_speed)
fan_fault_status = try_get(fan.get_status, False)
fan_direction = try_get(fan.get_direction)

set_led = False
if fan_status.set_presence(presence):
Expand All @@ -215,15 +228,15 @@ class FanUpdater(object):
'the system, potential overheat hazard'.format(fan_name)
)

if fan_status.set_under_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
set_led = True
log_on_status_changed(not fan_status.under_speed,
'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name),
'Fan under speed warning: {} current speed={}, target speed={}, tolerance={}.'.
format(fan_name, speed, speed_target, speed_tolerance)
)

if fan_status.set_over_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
set_led = True
log_on_status_changed(not fan_status.over_speed,
'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name),
Expand All @@ -240,8 +253,8 @@ class FanUpdater(object):
[('presence', str(presence)),
('model', str(try_get(fan.get_model))),
('serial', str(try_get(fan.get_serial))),
('status', str(try_get(fan.get_status, False))),
('direction', str(try_get(fan.get_direction))),
('status', str(fan_fault_status)),
('direction', str(fan_direction)),
('speed', str(speed)),
('speed_tolerance', str(speed_tolerance)),
('speed_target', str(speed_target)),
Expand Down Expand Up @@ -378,12 +391,12 @@ class TemperatureUpdater(object):
Update all temperature information to database
:return:
"""
logger.log_debug("Start temperature updating")
try:
for index, thermal in enumerate(self.chassis.get_all_thermals()):
logger.log_debug("Start temperature updating")
for index, thermal in enumerate(self.chassis.get_all_thermals()):
try:
self._refresh_temperature_status(thermal, index)
except Exception as e:
logger.log_warning('Failed to update thermal status - {}'.format(e))
except Exception as e:
logger.log_warning('Failed to update thermal status - {}'.format(e))

logger.log_debug("End temperature updating")

Expand All @@ -400,26 +413,33 @@ class TemperatureUpdater(object):

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
temperature_status.set_temperature(name, temperature)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)

if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature_status.set_over_temperature(temperature, high_threshold):
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning cleared: {} temperature restore to {}C, high threshold {}C.'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature_status.set_under_temperature(temperature, low_threshold):
if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning cleared: {} temperature restore to {}C, low threshold {}C.'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature
Expand All @@ -429,8 +449,8 @@ class TemperatureUpdater(object):
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(try_get(thermal.get_high_critical_threshold))),
('critical_low_threshold', str(try_get(thermal.get_low_critical_threshold))),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])

Expand Down
18 changes: 18 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def make_normal_speed(self):
self.speed_tolerance = 0


class MockErrorFan(MockFan):
def get_speed(self):
raise Exception('Fail to get speed')


class MockPsu(MockDevice):
def __init__(self):
self.fan_list = []
Expand Down Expand Up @@ -118,6 +123,11 @@ def make_normal_temper(self):
self.temperature = 2
self.low_threshold = 1


class MockErrorThermal(MockThermal):
def get_temperature(self):
raise Exception('Fail to get temperature')


class MockChassis:
def __init__(self):
Expand Down Expand Up @@ -149,6 +159,10 @@ def make_over_speed_fan(self):
fan.make_over_speed()
self.fan_list.append(fan)

def make_error_fan(self):
fan = MockErrorFan()
self.fan_list.append(fan)

def make_over_temper_thermal(self):
thermal = MockThermal()
thermal.make_over_temper()
Expand All @@ -159,3 +173,7 @@ def make_under_temper_thermal(self):
thermal.make_under_temper()
self.thermal_list.append(thermal)

def make_error_thermal(self):
thermal = MockErrorThermal()
self.thermal_list.append(thermal)

26 changes: 25 additions & 1 deletion sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
from mock import Mock, MagicMock, patch
from sonic_daemon_base import daemon_base
from .mock_platform import MockChassis, MockFan
from .mock_platform import MockChassis, MockFan, MockThermal

daemon_base.db_connect = MagicMock()

Expand Down Expand Up @@ -198,3 +198,27 @@ def test_temperupdater_under_temper():
temperature_updater.update()
logger.log_notice.assert_called_once()


def test_update_fan_with_exception():
chassis = MockChassis()
chassis.make_error_fan()
fan = MockFan()
fan.make_over_speed()
chassis.get_all_fans().append(fan)

fan_updater = FanUpdater(chassis)
fan_updater.update()
assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED
logger.log_warning.assert_called()


def test_update_thermal_with_exception():
chassis = MockChassis()
chassis.make_error_thermal()
thermal = MockThermal()
thermal.make_over_temper()
chassis.get_all_thermals().append(thermal)

temperature_updater = TemperatureUpdater(chassis)
temperature_updater.update()
logger.log_warning.assert_called()

0 comments on commit 97e40ce

Please sign in to comment.