Skip to content

Commit cc3803f

Browse files
[thermalctld] Enable stopping thermal manager (sonic-net#180)
Make thermalctld exit as soon as possible. Depends on sonic-net/sonic-platform-common#187 During config reload flow, supervisord sends SIGTERM to thermalctld. However, if thermalctld cannot exit in 10 seconds, supervisord sends SITKILL to thermalctld. In this case, sub process of thermalctld might still stay in memory as a zombie process. This PR is aimed to fix this.
1 parent 665fcd9 commit cc3803f

File tree

2 files changed

+55
-25
lines changed

2 files changed

+55
-25
lines changed

sonic-thermalctld/scripts/thermalctld

+24-4
Original file line numberDiff line numberDiff line change
@@ -191,14 +191,15 @@ class FanUpdater(logger.Logger):
191191
FAN_INFO_TABLE_NAME = 'FAN_INFO'
192192
FAN_DRAWER_INFO_TABLE_NAME = 'FAN_DRAWER_INFO'
193193

194-
def __init__(self, chassis):
194+
def __init__(self, chassis, task_stopping_event):
195195
"""
196196
Initializer for FanUpdater
197197
:param chassis: Object representing a platform chassis
198198
"""
199199
super(FanUpdater, self).__init__(SYSLOG_IDENTIFIER)
200200

201201
self.chassis = chassis
202+
self.task_stopping_event = task_stopping_event
202203
self.fan_status_dict = {}
203204
state_db = daemon_base.db_connect("STATE_DB")
204205
self.table = swsscommon.Table(state_db, FanUpdater.FAN_INFO_TABLE_NAME)
@@ -236,15 +237,21 @@ class FanUpdater(logger.Logger):
236237
FanStatus.reset_fan_counter()
237238

238239
for drawer_index, drawer in enumerate(self.chassis.get_all_fan_drawers()):
240+
if self.task_stopping_event.is_set():
241+
return
239242
self._refresh_fan_drawer_status(drawer, drawer_index)
240243
for fan_index, fan in enumerate(drawer.get_all_fans()):
244+
if self.task_stopping_event.is_set():
245+
return
241246
try:
242247
self._refresh_fan_status(drawer, drawer_index, fan, fan_index)
243248
except Exception as e:
244249
self.log_warning('Failed to update fan status - {}'.format(repr(e)))
245250

246251
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
247252
for fan_index, fan in enumerate(psu.get_all_fans()):
253+
if self.task_stopping_event.is_set():
254+
return
248255
try:
249256
self._refresh_fan_status(psu, psu_index, fan, fan_index, True)
250257
except Exception as e:
@@ -396,6 +403,8 @@ class FanUpdater(logger.Logger):
396403

397404
def _update_led_color(self):
398405
for fan_name, fan_status in self.fan_status_dict.items():
406+
if self.task_stopping_event.is_set():
407+
return
399408
try:
400409
fvs = swsscommon.FieldValuePairs([
401410
('led_status', str(try_get(fan_status.fan.get_status_led)))
@@ -408,6 +417,8 @@ class FanUpdater(logger.Logger):
408417
self.table.set(fan_name, fvs)
409418

410419
for drawer in self.chassis.get_all_fan_drawers():
420+
if self.task_stopping_event.is_set():
421+
return
411422
drawer_name = try_get(drawer.get_name)
412423
if drawer_name == NOT_AVAILABLE:
413424
continue
@@ -510,14 +521,15 @@ class TemperatureUpdater(logger.Logger):
510521
# Temperature information table name in database
511522
TEMPER_INFO_TABLE_NAME = 'TEMPERATURE_INFO'
512523

513-
def __init__(self, chassis):
524+
def __init__(self, chassis, task_stopping_event):
514525
"""
515526
Initializer of TemperatureUpdater
516527
:param chassis: Object representing a platform chassis
517528
"""
518529
super(TemperatureUpdater, self).__init__(SYSLOG_IDENTIFIER)
519530

520531
self.chassis = chassis
532+
self.task_stopping_event = task_stopping_event
521533
self.temperature_status_dict = {}
522534
state_db = daemon_base.db_connect("STATE_DB")
523535
self.table = swsscommon.Table(state_db, TemperatureUpdater.TEMPER_INFO_TABLE_NAME)
@@ -562,6 +574,8 @@ class TemperatureUpdater(logger.Logger):
562574
"""
563575
self.log_debug("Start temperature updating")
564576
for index, thermal in enumerate(self.chassis.get_all_thermals()):
577+
if self.task_stopping_event.is_set():
578+
return
565579
try:
566580
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
567581
except Exception as e:
@@ -570,6 +584,8 @@ class TemperatureUpdater(logger.Logger):
570584
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
571585
parent_name = 'PSU {}'.format(psu_index + 1)
572586
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
587+
if self.task_stopping_event.is_set():
588+
return
573589
try:
574590
self._refresh_temperature_status(parent_name, thermal, thermal_index)
575591
except Exception as e:
@@ -578,6 +594,8 @@ class TemperatureUpdater(logger.Logger):
578594
for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()):
579595
parent_name = 'SFP {}'.format(sfp_index + 1)
580596
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
597+
if self.task_stopping_event.is_set():
598+
return
581599
try:
582600
self._refresh_temperature_status(parent_name, thermal, thermal_index)
583601
except Exception as e:
@@ -686,8 +704,8 @@ class ThermalMonitor(ProcessTaskBase):
686704
# Set minimum logging level to INFO
687705
self.logger.set_min_log_priority_info()
688706

689-
self.fan_updater = FanUpdater(chassis)
690-
self.temperature_updater = TemperatureUpdater(chassis)
707+
self.fan_updater = FanUpdater(chassis, self.task_stopping_event)
708+
self.temperature_updater = TemperatureUpdater(chassis, self.task_stopping_event)
691709

692710
def main(self):
693711
begin = time.time()
@@ -789,6 +807,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
789807
if sig in FATAL_SIGNALS:
790808
self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig]))
791809
exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us
810+
self.thermal_monitor.task_stop()
811+
self.thermal_manager.stop()
792812
self.stop_event.set()
793813
elif sig in NONFATAL_SIGNALS:
794814
self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))

sonic-thermalctld/tests/test_thermalctld.py

+31-21
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import sys
3+
import multiprocessing
34
from imp import load_source # TODO: Replace with importlib once we no longer need to support Python 2
45

56
# TODO: Clean this up once we no longer need to support Python 2
@@ -148,7 +149,7 @@ class TestFanUpdater(object):
148149
Test cases to cover functionality in FanUpdater class
149150
"""
150151
def test_deinit(self):
151-
fan_updater = thermalctld.FanUpdater(MockChassis())
152+
fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event())
152153
fan_updater.fan_status_dict = {'key1': 'value1', 'key2': 'value2'}
153154
fan_updater.table._del = mock.MagicMock()
154155

@@ -161,7 +162,7 @@ def test_deinit(self):
161162
@mock.patch('thermalctld.update_entity_info', mock.MagicMock())
162163
def test_refresh_fan_drawer_status_fan_drawer_get_name_not_impl(self):
163164
# Test case where fan_drawer.get_name is not implemented
164-
fan_updater = thermalctld.FanUpdater(MockChassis())
165+
fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event())
165166
mock_fan_drawer = mock.MagicMock()
166167
fan_updater._refresh_fan_drawer_status(mock_fan_drawer, 1)
167168
assert thermalctld.update_entity_info.call_count == 0
@@ -175,7 +176,7 @@ def test_update_fan_with_exception(self):
175176
fan.make_over_speed()
176177
chassis.get_all_fans().append(fan)
177178

178-
fan_updater = thermalctld.FanUpdater(chassis)
179+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
179180
fan_updater.update()
180181
assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED
181182
assert fan_updater.log_warning.call_count == 1
@@ -192,15 +193,15 @@ def test_set_fan_led_exception(self):
192193
mock_fan = MockFan()
193194
mock_fan.set_status_led = mock.MagicMock(side_effect=NotImplementedError)
194195

195-
fan_updater = thermalctld.FanUpdater(MockChassis())
196+
fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event())
196197
fan_updater._set_fan_led(mock_fan_drawer, mock_fan, 'Test Fan', fan_status)
197198
assert fan_updater.log_warning.call_count == 1
198199
fan_updater.log_warning.assert_called_with('Failed to set status LED for fan Test Fan, set_status_led not implemented')
199200

200201
def test_fan_absent(self):
201202
chassis = MockChassis()
202203
chassis.make_absent_fan()
203-
fan_updater = thermalctld.FanUpdater(chassis)
204+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
204205
fan_updater.update()
205206
fan_list = chassis.get_all_fans()
206207
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
@@ -224,7 +225,7 @@ def test_fan_absent(self):
224225
def test_fan_faulty(self):
225226
chassis = MockChassis()
226227
chassis.make_faulty_fan()
227-
fan_updater = thermalctld.FanUpdater(chassis)
228+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
228229
fan_updater.update()
229230
fan_list = chassis.get_all_fans()
230231
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
@@ -248,7 +249,7 @@ def test_fan_faulty(self):
248249
def test_fan_under_speed(self):
249250
chassis = MockChassis()
250251
chassis.make_under_speed_fan()
251-
fan_updater = thermalctld.FanUpdater(chassis)
252+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
252253
fan_updater.update()
253254
fan_list = chassis.get_all_fans()
254255
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
@@ -264,7 +265,7 @@ def test_fan_under_speed(self):
264265
def test_fan_over_speed(self):
265266
chassis = MockChassis()
266267
chassis.make_over_speed_fan()
267-
fan_updater = thermalctld.FanUpdater(chassis)
268+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
268269
fan_updater.update()
269270
fan_list = chassis.get_all_fans()
270271
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
@@ -283,7 +284,7 @@ def test_update_psu_fans(self):
283284
mock_fan = MockFan()
284285
psu._fan_list.append(mock_fan)
285286
chassis._psu_list.append(psu)
286-
fan_updater = thermalctld.FanUpdater(chassis)
287+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
287288
fan_updater.update()
288289
assert fan_updater.log_warning.call_count == 0
289290

@@ -331,7 +332,7 @@ def test_insufficient_fan_number():
331332
chassis = MockChassis()
332333
chassis.make_absent_fan()
333334
chassis.make_faulty_fan()
334-
fan_updater = thermalctld.FanUpdater(chassis)
335+
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
335336
fan_updater.update()
336337
assert fan_updater.log_warning.call_count == 3
337338
expected_calls = [
@@ -415,19 +416,20 @@ class TestTemperatureUpdater(object):
415416
"""
416417
def test_deinit(self):
417418
chassis = MockChassis()
418-
temp_updater = thermalctld.TemperatureUpdater(chassis)
419+
temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
419420
temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'}
420421
temp_updater.table._del = mock.MagicMock()
421422

422423
temp_updater.deinit()
423424
assert temp_updater.table._del.call_count == 2
424425
expected_calls = [mock.call('key1'), mock.call('key2')]
425426
temp_updater.table._del.assert_has_calls(expected_calls, any_order=True)
427+
426428

427429
def test_over_temper(self):
428430
chassis = MockChassis()
429431
chassis.make_over_temper_thermal()
430-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
432+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
431433
temperature_updater.update()
432434
thermal_list = chassis.get_all_thermals()
433435
assert temperature_updater.log_warning.call_count == 1
@@ -441,7 +443,7 @@ def test_over_temper(self):
441443
def test_under_temper(self):
442444
chassis = MockChassis()
443445
chassis.make_under_temper_thermal()
444-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
446+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
445447
temperature_updater.update()
446448
thermal_list = chassis.get_all_thermals()
447449
assert temperature_updater.log_warning.call_count == 1
@@ -458,7 +460,7 @@ def test_update_psu_thermals(self):
458460
mock_thermal = MockThermal()
459461
psu._thermal_list.append(mock_thermal)
460462
chassis._psu_list.append(psu)
461-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
463+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
462464
temperature_updater.update()
463465
assert temperature_updater.log_warning.call_count == 0
464466

@@ -478,7 +480,7 @@ def test_update_sfp_thermals(self):
478480
mock_thermal = MockThermal()
479481
sfp._thermal_list.append(mock_thermal)
480482
chassis._sfp_list.append(sfp)
481-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
483+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
482484
temperature_updater.update()
483485
assert temperature_updater.log_warning.call_count == 0
484486

@@ -499,7 +501,7 @@ def test_update_thermal_with_exception(self):
499501
thermal.make_over_temper()
500502
chassis.get_all_thermals().append(thermal)
501503

502-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
504+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
503505
temperature_updater.update()
504506
assert temperature_updater.log_warning.call_count == 2
505507

@@ -524,17 +526,17 @@ def test_updater_thermal_check_modular_chassis():
524526
chassis = MockChassis()
525527
assert chassis.is_modular_chassis() == False
526528

527-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
529+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
528530
assert temperature_updater.chassis_table == None
529531

530532
chassis.set_modular_chassis(True)
531533
chassis.set_my_slot(-1)
532-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
534+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
533535
assert temperature_updater.chassis_table == None
534536

535537
my_slot = 1
536538
chassis.set_my_slot(my_slot)
537-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
539+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
538540
assert temperature_updater.chassis_table != None
539541
assert temperature_updater.chassis_table.table_name == '{}_{}'.format(TEMPER_INFO_TABLE_NAME, str(my_slot))
540542

@@ -547,7 +549,7 @@ def test_updater_thermal_check_chassis_table():
547549

548550
chassis.set_modular_chassis(True)
549551
chassis.set_my_slot(1)
550-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
552+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
551553

552554
temperature_updater.update()
553555
assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals()
@@ -566,7 +568,7 @@ def test_updater_thermal_check_min_max():
566568

567569
chassis.set_modular_chassis(True)
568570
chassis.set_my_slot(1)
569-
temperature_updater = thermalctld.TemperatureUpdater(chassis)
571+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
570572

571573
temperature_updater.update()
572574
slot_dict = temperature_updater.chassis_table.get(thermal.get_name())
@@ -580,26 +582,30 @@ def test_signal_handler():
580582
daemon_thermalctld.stop_event.set = mock.MagicMock()
581583
daemon_thermalctld.log_info = mock.MagicMock()
582584
daemon_thermalctld.log_warning = mock.MagicMock()
585+
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
583586
daemon_thermalctld.signal_handler(thermalctld.signal.SIGHUP, None)
584587
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
585588
assert daemon_thermalctld.log_info.call_count == 1
586589
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...")
587590
assert daemon_thermalctld.log_warning.call_count == 0
588591
assert daemon_thermalctld.stop_event.set.call_count == 0
592+
assert daemon_thermalctld.thermal_manager.stop.call_count == 0
589593
assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN
590594

591595
# Test SIGINT
592596
daemon_thermalctld = thermalctld.ThermalControlDaemon()
593597
daemon_thermalctld.stop_event.set = mock.MagicMock()
594598
daemon_thermalctld.log_info = mock.MagicMock()
595599
daemon_thermalctld.log_warning = mock.MagicMock()
600+
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
596601
test_signal = thermalctld.signal.SIGINT
597602
daemon_thermalctld.signal_handler(test_signal, None)
598603
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
599604
assert daemon_thermalctld.log_info.call_count == 1
600605
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGINT' - exiting...")
601606
assert daemon_thermalctld.log_warning.call_count == 0
602607
assert daemon_thermalctld.stop_event.set.call_count == 1
608+
assert daemon_thermalctld.thermal_manager.stop.call_count == 1
603609
assert thermalctld.exit_code == (128 + test_signal)
604610

605611
# Test SIGTERM
@@ -608,13 +614,15 @@ def test_signal_handler():
608614
daemon_thermalctld.stop_event.set = mock.MagicMock()
609615
daemon_thermalctld.log_info = mock.MagicMock()
610616
daemon_thermalctld.log_warning = mock.MagicMock()
617+
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
611618
test_signal = thermalctld.signal.SIGTERM
612619
daemon_thermalctld.signal_handler(test_signal, None)
613620
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
614621
assert daemon_thermalctld.log_info.call_count == 1
615622
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGTERM' - exiting...")
616623
assert daemon_thermalctld.log_warning.call_count == 0
617624
assert daemon_thermalctld.stop_event.set.call_count == 1
625+
assert daemon_thermalctld.thermal_manager.stop.call_count == 1
618626
assert thermalctld.exit_code == (128 + test_signal)
619627

620628
# Test an unhandled signal
@@ -623,12 +631,14 @@ def test_signal_handler():
623631
daemon_thermalctld.stop_event.set = mock.MagicMock()
624632
daemon_thermalctld.log_info = mock.MagicMock()
625633
daemon_thermalctld.log_warning = mock.MagicMock()
634+
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
626635
daemon_thermalctld.signal_handler(thermalctld.signal.SIGUSR1, None)
627636
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
628637
assert daemon_thermalctld.log_warning.call_count == 1
629638
daemon_thermalctld.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...")
630639
assert daemon_thermalctld.log_info.call_count == 0
631640
assert daemon_thermalctld.stop_event.set.call_count == 0
641+
assert daemon_thermalctld.thermal_manager.stop.call_count == 0
632642
assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN
633643

634644

0 commit comments

Comments
 (0)