From bffad0556624d1176455ac677ed4d7b21a8728f1 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Thu, 21 Nov 2024 18:02:35 +0000 Subject: [PATCH 01/10] Thermal updater changes for smart switch --- .../sonic_platform/thermal_manager.py | 19 +- .../sonic_platform/thermal_updater.py | 159 +++++++++++++- .../tests/test_thermal_updater.py | 197 +++++++++++++++++- 3 files changed, 361 insertions(+), 14 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 3512a0cf52e5..a8c7a768f8d0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -21,6 +21,7 @@ class ThermalManager(ThermalManagerBase): thermal_updater_task = None + thermal_updaer_req = None @classmethod def run_policy(cls, chassis): @@ -33,12 +34,24 @@ def initialize(cls): and any other vendor specific initialization. :return: """ + cls.thermal_updater_req = False + sfps = [] + dpus = [] + host_mgmt_mode = False if DeviceDataManager.is_module_host_management_mode(): from .chassis import Chassis - cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps()) + sfps = Chassis.chassis_instance.get_all_sfps() + cls.thermal_updater_req = True + host_mgmt_mode = True + if DeviceDataManager.get_platform_dpus_data(): + # If DPUs are present then this if condition is reached + from .chassis import Chassis + dpus = Chassis.chassis_instance.get_all_modules() + cls.thermal_updater_req = True + if cls.thermal_updater_req: + cls.thermal_updater_task = thermal_updater.ThermalUpdater(sfps, dpus, host_mgmt_mode) cls.thermal_updater_task.start() - @classmethod def deinitialize(cls): """ @@ -46,5 +59,5 @@ def deinitialize(cls): is a no-op. 
:return: """ - if DeviceDataManager.is_module_host_management_mode() and cls.thermal_updater_task: + if cls.thermal_updater_req and cls.thermal_updater_task: cls.thermal_updater_task.stop() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index 889bc96d3bec..6bb2cb4d52e7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -19,7 +19,6 @@ from sonic_py_common import logger import sys -import time sys.path.append('/run/hw-management/bin') @@ -35,11 +34,36 @@ hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock() hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock() +try: + import hw_management_dpu_thermal_update +except ImportError: + # For unit test and for non-smartswitch systems, these functions should not be called + from unittest import mock + hw_management_dpu_thermal_update = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear = mock.MagicMock() SFP_TEMPERATURE_SCALE = 1000 ASIC_TEMPERATURE_SCALE = 125 ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000 ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000 +CRIT_THRESH = "critical_high_threshold" +HIGH_THRESH = "high_threshold" +TEMPERATURE_DATA = "temperature" +DPU_STATUS_OFFLINE = "Offline" +DPU_STATUS_ONLINE = "Online" +CPU_FIELD = "CPU" +NVME_FIELD = "NVME" +DDR_FIELD = "DDR" +dpu_func_dict = { + CPU_FIELD: 
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set, + NVME_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_drive_set, + DDR_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set, + } ERROR_READ_THERMAL_DATA = 254000 @@ -48,17 +72,50 @@ class ThermalUpdater: - def __init__(self, sfp_list): + def __init__(self, sfp_list, dpu_list=[], is_host_mgmt_mode=True): + # Default initialization is in host mgmt mode without dpus self._sfp_list = sfp_list self._sfp_status = {} self._timer = utils.Timer() + self._dpu_list = dpu_list + self._dpu_status = dpu_list + self.dpus_exist = False + if len(self._dpu_list) > 0: + self.dpus_exist = True + self._dpu_status = {} + self.dev_parameters = None + self.data = None + self.read_checked = False + self.configure_functions(self.dpus_exist, is_host_mgmt_mode) - def load_tc_config(self): - asic_poll_interval = 1 - sfp_poll_interval = 10 + def configure_functions(self, dpu, independent_mode): + self.start = self.start_independent_mode + self.stop = self.stop_independent_mode + self.load_tc_config = self.load_tc_config_asic_sfp + self.clean_thermal_data = self.clean_thermal_data_asic_sfp + if dpu: + self.clean_thermal_data = self.clean_all + self.load_tc_config = self.load_tc_config_all + if not independent_mode: + self.start = self.start_no_independent + self.stop = self.stop_no_independent_mode + self.load_tc_config = self.load_tc_config_dpu + self.clean_thermal_data = self.clean_thermal_data_dpu + + def read_tc_config_data(self): + if self.read_checked: + return self.data data = utils.load_json_file(TC_CONFIG_FILE, log_func=None) if not data: logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval') + self.data = data + self.read_checked = True + return self.data + + def load_tc_config_asic_sfp(self): + asic_poll_interval = 1 + sfp_poll_interval = 10 + data = self.read_tc_config_data() if data: dev_parameters = data.get('dev_parameters') @@ -79,21 +136,48 @@ def 
load_tc_config(self): logger.log_notice(f'Module polling interval: {sfp_poll_interval}') self._timer.schedule(sfp_poll_interval, self.update_module) - def start(self): + def load_tc_config_dpu(self): + dpu_poll_interval = 3 + data = self.read_tc_config_data() + if data: + dev_parameters = data.get('dev_parameters', {}) + dpu_parameter = dev_parameters.get('dpu\\d+_module', {}) + dpu_poll_interval_config = dpu_parameter.get('poll_time') + dpu_poll_interval = int(dpu_poll_interval_config) / 2 if dpu_poll_interval_config else dpu_poll_interval + logger.log_notice(f'DPU polling interval: {dpu_poll_interval}') + self._timer.schedule(dpu_poll_interval, self.update_dpu) + + def load_tc_config_all(self): + self.load_tc_config_asic_sfp() + self.load_tc_config_dpu() + + def start_independent_mode(self): self.clean_thermal_data() self.control_tc(False) self.load_tc_config() self._timer.start() - def stop(self): + def start_no_independent(self): + self.clean_thermal_data() + self.load_tc_config() + self._timer.start() + + def stop_independent_mode(self): self._timer.stop() self.control_tc(True) + def stop_no_independent_mode(self): + self._timer.stop() + def control_tc(self, suspend): logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}') utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0) - def clean_thermal_data(self): + def clean_all(self): + self.clean_thermal_data_asic_sfp() + self.clean_thermal_data_dpu() + + def clean_thermal_data_asic_sfp(self): hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list)) hw_management_independent_mode_update.thermal_data_clean_asic(0) for sfp in self._sfp_list: @@ -102,6 +186,15 @@ def clean_thermal_data(self): sfp.sdk_index + 1 ) + def clean_thermal_data_dpu(self): + for dpu in self._dpu_list: + self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) + + def thermal_data_dpu_clear(self, dpu_index): + 
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear(dpu_index) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear(dpu_index) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear(dpu_index) + def get_asic_temp(self): temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None @@ -157,10 +250,60 @@ def update_single_module(self, sfp): ERROR_READ_THERMAL_DATA ) + def get_dpu_temperature_data_from_dict_obj(self, dpu_component_temperature_data, field_name): + value = dpu_component_temperature_data.get(field_name) + fault_state = False + if not value: + fault_state = True + return 0, fault_state + try: + int_value = int(float(value)) + except ValueError: + logger.log_error(f"Unable to obtain temperature data for DPU {field_name}: {value}") + int_value = 0 + fault_state = True + return int_value, fault_state + + def get_dpu_component_temperature_data(self, dpu_temperature_data, component_name): + dpu_component_temperature_data = dpu_temperature_data.get(component_name, {}) + output_dict = {} + output_false_state = False + for value in [TEMPERATURE_DATA, HIGH_THRESH, CRIT_THRESH]: + output_dict[value], fault_state = self.get_dpu_temperature_data_from_dict_obj(dpu_component_temperature_data, value) + output_false_state = output_false_state or fault_state + return output_dict[TEMPERATURE_DATA], output_dict[HIGH_THRESH], output_dict[CRIT_THRESH], ERROR_READ_THERMAL_DATA if output_false_state else 0 + + def update_dpu_temperature(self, dpu, fault_state=False): + dpu_temperature_data = dpu.get_temperature_dict() if not fault_state else {} + for key, func in dpu_func_dict.items(): + temp_data, temp_thresh, temp_crit_thresh, fault_val = self.get_dpu_component_temperature_data(dpu_temperature_data, key) + return_val = func(dpu.get_hw_mgmt_id(), temp_data, temp_thresh, temp_crit_thresh, fault_val) + if not return_val: + 
logger.log_error(f"Unable to update Temperature data to hw-mgmt for {key} for {dpu.get_name()}") + + def update_single_dpu(self, dpu): + try: + dpu_oper_status = dpu.get_oper_status() + pre_oper_status = self._dpu_status.get(dpu.get_name()) + if dpu_oper_status == DPU_STATUS_ONLINE: + self.update_dpu_temperature(dpu) + else: + if pre_oper_status != dpu_oper_status: + self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) + if pre_oper_status != dpu_oper_status: + self._dpu_status[dpu.get_name()] = dpu_oper_status + except Exception as e: + logger.log_error(f'Failed to update DPU {dpu.get_hw_mgmt_id()} thermal data - {e}') + self.update_dpu_temperature(dpu, fault_state=True) + def update_module(self): for sfp in self._sfp_list: self.update_single_module(sfp) + def update_dpu(self): + for dpu in self._dpu_list: + self.update_single_dpu(dpu) + def update_asic(self): try: asic_temp = self.get_asic_temp() diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py index c135395c363b..f26bef353a5c 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -15,13 +15,13 @@ # limitations under the License. 
# -import time from unittest import mock +import copy from sonic_platform import utils -from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update +from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update, hw_management_dpu_thermal_update from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \ - ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD + ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD, ERROR_READ_THERMAL_DATA mock_tc_config = """ @@ -40,6 +40,10 @@ "val_min": 60000, "val_max": 80000, "poll_time": 20 + }, + "dpu\\\\d+_module": { + "child_sensors_list": ["cx_amb", "voltmon1", "voltmon2"], + "poll_time": 24 } } } @@ -109,3 +113,190 @@ def test_update_module(self): mock_sfp.get_presence = mock.MagicMock(return_value=False) updater.update_module() hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) + + # Smartswitch specific tests + def test_load_tc_config_non_exists_dpu(self): + dpu = mock.MagicMock() + updater = ThermalUpdater(None, dpu_list=[dpu]) + updater.load_tc_config() + # 3 events - ASIC, DPU, sfp + assert updater._timer._timestamp_queue.qsize() == 3 + updater = ThermalUpdater(None, dpu_list=[dpu], is_host_mgmt_mode=False) + updater.load_tc_config() + # 1 event - DPU + assert updater._timer._timestamp_queue.qsize() == 1 + + def test_load_tc_config_mocked_dpu(self): + dpu = mock.MagicMock() + updater = ThermalUpdater(None, dpu_list=[dpu]) + mock_os_open = mock.mock_open(read_data=mock_tc_config) + with mock.patch('sonic_platform.utils.open', mock_os_open): + updater.load_tc_config() + assert updater._timer._timestamp_queue.qsize() == 3 + + @mock.patch('sonic_platform.utils.write_file') + def test_configuration(self, mock_write): + dpu = mock.MagicMock() + mock_sfp = mock.MagicMock() + mock_sfp.sdk_index = 1 + self.reset_hw_mgmt_mocks() + mock_os_open = mock.mock_open(read_data=mock_tc_config) + updater = ThermalUpdater([mock_sfp], 
dpu_list=[dpu]) + """ Expectation on start - Clean is called for sfp, asic, DPU + suspend -> 1 and load config for all 3 along with start of timer""" + updater._timer = mock.MagicMock() + mock_os_open = mock.mock_open(read_data=mock_tc_config) + with mock.patch('sonic_platform.utils.open', mock_os_open): + updater.start() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_independent_mode_update.thermal_data_clean_asic.assert_called_once() + hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0) + assert updater._timer.schedule.call_count == 3 + # Called for DPU with time 24/2 = 12 + assert updater._timer.schedule.call_args_list[-1][0][0] == 12 + # Expectation on stop - timer stop and suspend = 1 + mock_write.reset_mock() + updater.stop() + updater._timer.stop.assert_called_once() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) + mock_write.reset_mock() + self.reset_hw_mgmt_mocks() + updater = ThermalUpdater(None, dpu_list=[dpu], is_host_mgmt_mode=False) + """ Expectation on start - Clean is called for DPU + load config for DPU along with start of timer""" + updater._timer = mock.MagicMock() + updater.start() + mock_write.assert_not_called() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + 
hw_management_independent_mode_update.thermal_data_clean_asic.assert_not_called() + hw_management_independent_mode_update.thermal_data_clean_module.assert_not_called() + # Expectation on stop - timer stop + updater.stop() + updater._timer.stop.assert_called_once() + mock_write.assert_not_called() + + def test_update_dpu(self): + self.reset_hw_mgmt_mocks() + mock_dpu = mock.MagicMock() + mock_dpu.get_hw_mgmt_id = mock.MagicMock(return_value=1) + mock_dpu.get_name = mock.MagicMock(return_value="DPU0") + mock_dpu.get_oper_status = mock.MagicMock(return_value="Online") + temp_data = { + "DDR": {'temperature': '75.0', 'high_threshold': '95', 'critical_high_threshold': '100'}, + "CPU": {'temperature': '82.0', 'high_threshold': '90', 'critical_high_threshold': '100'}, + "NVME": {'temperature': '91', 'high_threshold': '95', 'critical_high_threshold': '98'} + } + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data) + updater = ThermalUpdater(sfp_list=None, dpu_list=[mock_dpu], is_host_mgmt_mode=False) + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 75, 95, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 82, 90, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 91, 95, 98, 0) + mock_dpu.get_temperature_dict = mock.MagicMock(return_value={}) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + func_dict = { + "DDR": hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set, + "CPU": 
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set, + "NVME": hw_management_dpu_thermal_update.thermal_data_dpu_drive_set, + } + for value in ["DDR", "CPU", "NVME"]: + temp_data_without_entry = copy.deepcopy(temp_data) + # One of the values in DDR, CPU and NVME is set to empty + temp_data_without_entry[value] = {} + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data_without_entry) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + for key, func in func_dict.items(): + if key == value: + func.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + else: + func.assert_called_once_with( + 1, + int(float(temp_data[key]['temperature'])), + int(float(temp_data[key]['high_threshold'])), + int(float(temp_data[key]['critical_high_threshold'])), + 0) + # One of the values in DDR, CPU and NVME is set to a string, can not convert to integer + for field in ["temperature", "high_threshold", "critical_high_threshold"]: + temp_data_invalid = copy.deepcopy(temp_data) + temp_data_orig = copy.deepcopy(temp_data) + temp_data_invalid[value][field] = "N/A" + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data_invalid) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + for key, func in func_dict.items(): + temp_data_orig[value][field] = 0 + func.assert_called_once_with( + 1, + int(float(temp_data_orig[key]['temperature'])), + int(float(temp_data_orig[key]['high_threshold'])), + int(float(temp_data_orig[key]['critical_high_threshold'])), + ERROR_READ_THERMAL_DATA if value == key else 0) + self.reset_hw_mgmt_mocks() + mock_dpu.get_oper_status = mock.MagicMock(return_value="Offline") + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(1) + 
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_not_called() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_not_called() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_not_called() + # Clear is called only once + updater.update_dpu() + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(1) + self.reset_hw_mgmt_mocks() + mock_dpu.get_oper_status = mock.MagicMock(return_value="Online") + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data) + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 75, 95, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 82, 90, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 91, 95, 98, 0) + # Multiple dpus + mock_dpu1 = mock.MagicMock() + mock_dpu1.get_hw_mgmt_id = mock.MagicMock(return_value=2) + mock_dpu1.get_name = mock.MagicMock(return_value="DPU1") + mock_dpu1.get_oper_status = mock.MagicMock(return_value="Online") + temp_data_1 = copy.deepcopy(temp_data) + temp_data_1["DDR"]["temperature"] = "52.0" + temp_data_1["CPU"]["temperature"] = "20.0" + temp_data_1["NVME"]["temperature"] = "100.0" + mock_dpu1.get_temperature_dict = mock.MagicMock(return_value=temp_data_1) + updater = ThermalUpdater(sfp_list=None, dpu_list=[mock_dpu, mock_dpu1], is_host_mgmt_mode=False) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + assert hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.call_count == 2 + assert hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.call_count == 2 + assert hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.call_count == 2 
+ assert hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.call_args_list \ + == [mock.call(1, 75, 95, 100, 0), mock.call(2, 52, 95, 100, 0)] + assert hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.call_args_list \ + == [mock.call(1, 82, 90, 100, 0), mock.call(2, 20, 90, 100, 0)] + assert hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.call_args_list \ + == [mock.call(1, 91, 95, 98, 0), mock.call(2, 100, 95, 98, 0)] + + def reset_hw_mgmt_mocks(self): + hw_management_independent_mode_update.reset_mock() + hw_management_independent_mode_update.thermal_data_clean_module.reset_mock() + hw_management_independent_mode_update.thermal_data_clean_asic.reset_mock() + hw_management_independent_mode_update.module_data_set_module_counter.reset_mock() + hw_management_independent_mode_update.thermal_data_set_asic.reset_mock() + hw_management_independent_mode_update.thermal_data_set_module.reset_mock() + hw_management_dpu_thermal_update.reset_mock() + hw_management_dpu_thermal_update.thermal_data_clean_module.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.reset_mock() From 05e2ddcfdf04c52e1fec30c5a34bc1dc6c45735c Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Thu, 21 Nov 2024 18:05:31 +0000 Subject: [PATCH 02/10] Test for thermal manager --- .../tests/test_thermal_manager.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py new file mode 100644 index 000000000000..a3a2994fc25b 
--- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py @@ -0,0 +1,65 @@ +# +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from unittest import mock +from sonic_platform.thermal_manager import ThermalManager + + +class TestThermalManager: + + @mock.patch('sonic_platform.chassis.Chassis.chassis_instance', new_callable=mock.MagicMock) + @mock.patch('sonic_platform.device_data.DeviceDataManager.is_module_host_management_mode') + @mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_json_data') + def test_updater_init(self, mock_dpus_data, mock_management_mode, mock_chassis_instance): + mock_dpus_data.return_value = {} + mock_management_mode.return_value = True + sfp_mock = mock.MagicMock() + mod_mock = mock.MagicMock() + mock_chassis_instance.get_all_sfps = sfp_mock + mock_chassis_instance.get_all_modules = mod_mock + sfp_mock.return_value = ['sfp1', 'sfp2'] + mod_mock.return_value = ['dpu1', 'dpu2'] + + with mock.patch('sonic_platform.thermal_updater.ThermalUpdater') as mock_thermal: + # Host mgmt mode, no DPUs are used for init + mgr = ThermalManager() + mgr.initialize() + mock_thermal.assert_called_once_with(['sfp1', 'sfp2'], [], True) + mgr.deinitialize() + mgr.thermal_updater_task.stop.assert_called_once() + # Not initialized if no DPUs and not in host mgmt mode + 
mock_management_mode.return_value = False + mock_thermal.reset_mock() + mgr.initialize() + mock_thermal.assert_not_called() + mgr.deinitialize() + mgr.thermal_updater_task.stop.assert_not_called() + # Initialized with DPUs if DPUs are present + mock_dpus_data.return_value = {'DPUS': 'dpu1'} + mock_thermal.reset_mock() + mgr.initialize() + mock_thermal.assert_called_once_with([], ['dpu1', 'dpu2'], False) + mgr.deinitialize() + mgr.thermal_updater_task.stop.assert_called_once() + # Host mgmt mode, with DPUS + mock_thermal.reset_mock() + mock_management_mode.return_value = True + mgr.initialize() + mock_thermal.assert_called_once_with(['sfp1', 'sfp2'], ['dpu1', 'dpu2'], True) + mgr.deinitialize() + mgr.thermal_updater_task.stop.assert_called_once() From 5888a5c69f19480500220d0e9afbd4929b4b332a Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 20:13:49 +0000 Subject: [PATCH 03/10] Moved smart switch content to different class --- .../smartswitch_thermal_updater.py | 182 +++++++++++++++ .../sonic_platform/thermal_manager.py | 26 +-- .../sonic_platform/thermal_updater.py | 161 +------------ .../test_smartswsitch_thermal_updater.py | 221 ++++++++++++++++++ .../tests/test_thermal_manager.py | 14 +- .../tests/test_thermal_updater.py | 199 +--------------- 6 files changed, 437 insertions(+), 366 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py new file mode 100644 index 000000000000..ffc137f57c9b --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py @@ -0,0 +1,182 @@ +# +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. 
+# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from . import utils +from .thermal_updater import ThermalUpdater +from sonic_py_common import logger + +import sys + +sys.path.append('/run/hw-management/bin') + +try: + import hw_management_dpu_thermal_update +except ImportError: + # For unit test and for non-smartswitch systems, these functions should not be called + from unittest import mock + hw_management_dpu_thermal_update = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear = mock.MagicMock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear = mock.MagicMock() + +SFP_TEMPERATURE_SCALE = 1000 +ASIC_TEMPERATURE_SCALE = 125 +ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000 +ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000 +CRIT_THRESH = "critical_high_threshold" +HIGH_THRESH = "high_threshold" +TEMPERATURE_DATA = "temperature" +DPU_STATUS_OFFLINE = "Offline" +DPU_STATUS_ONLINE = "Online" +CPU_FIELD = "CPU" +NVME_FIELD = "NVME" +DDR_FIELD = "DDR" +dpu_func_dict = { + CPU_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set, + NVME_FIELD: 
hw_management_dpu_thermal_update.thermal_data_dpu_drive_set, + DDR_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set, + } + +ERROR_READ_THERMAL_DATA = 254000 + +TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json' +logger = logger.Logger('smart-switch-thermal-updater') + + +class SmartswitchThermalUpdater(ThermalUpdater): + def __init__(self, sfp_list, dpu_list=[], is_host_mgmt_mode=True): + super().__init__(sfp_list=sfp_list) + self._sfp_list = sfp_list + self._sfp_status = {} + # Use single timer attribute + self._timer = utils.Timer() + self._dpu_list = dpu_list + self.configure_functions(is_host_mgmt_mode) + self._dpu_status = {} + + def configure_functions(self, independent_mode): + self.start = self.start_no_independent_mode + self.stop = self.stop_no_independent_mode + self.load_tc_config = self.load_tc_config_dpu + self.clean_thermal_data = self.clean_thermal_data_dpu + if independent_mode: + self.clean_thermal_data = self.clean_all + self.load_tc_config = self.load_tc_config_all + self.start = self.start_independent_mode + self.stop = self.stop_independent_mode + + def load_tc_config_dpu(self): + dpu_poll_interval = 3 + data = utils.load_json_file(TC_CONFIG_FILE, log_func=None) + if data: + dev_parameters = data.get('dev_parameters', {}) + dpu_parameter = dev_parameters.get('dpu\\d+_module', {}) + dpu_poll_interval_config = dpu_parameter.get('poll_time') + dpu_poll_interval = int(dpu_poll_interval_config) / 2 if dpu_poll_interval_config else dpu_poll_interval + else: + logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval') + logger.log_notice(f'DPU polling interval: {dpu_poll_interval}') + self._timer.schedule(dpu_poll_interval, self.update_dpu) + + def load_tc_config_all(self): + super().load_tc_config() + self.load_tc_config_dpu() + + def start_independent_mode(self): + self.clean_thermal_data() + super().control_tc(False) + self.load_tc_config() + self._timer.start() + + def 
start_no_independent_mode(self): + self.clean_thermal_data() + self.load_tc_config() + self._timer.start() + + def stop_independent_mode(self): + self._timer.stop() + super().control_tc(True) + + def stop_no_independent_mode(self): + self._timer.stop() + + def clean_all(self): + super().clean_thermal_data() + self.clean_thermal_data_dpu() + + def clean_thermal_data_dpu(self): + for dpu in self._dpu_list: + self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) + + def thermal_data_dpu_clear(self, dpu_index): + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear(dpu_index) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear(dpu_index) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear(dpu_index) + + def get_dpu_temperature_data_from_dict_obj(self, dpu_component_temperature_data, field_name): + value = dpu_component_temperature_data.get(field_name) + fault_state = False + if not value: + fault_state = True + return 0, fault_state + try: + int_value = int(float(value)) + except ValueError: + logger.log_error(f"Unable to obtain temperature data for DPU {field_name}: {value}") + int_value = 0 + fault_state = True + return int_value, fault_state + + def get_dpu_component_temperature_data(self, dpu_temperature_data, component_name): + dpu_component_temperature_data = dpu_temperature_data.get(component_name, {}) + output_dict = {} + output_false_state = False + for value in [TEMPERATURE_DATA, HIGH_THRESH, CRIT_THRESH]: + output_dict[value], fault_state = self.get_dpu_temperature_data_from_dict_obj(dpu_component_temperature_data, value) + output_false_state = output_false_state or fault_state + return output_dict[TEMPERATURE_DATA], output_dict[HIGH_THRESH], output_dict[CRIT_THRESH], ERROR_READ_THERMAL_DATA if output_false_state else 0 + + def update_dpu_temperature(self, dpu, fault_state=False): + dpu_temperature_data = dpu.get_temperature_dict() if not fault_state else {} + print(f"{dpu_temperature_data} is the data and {fault_state}") + for 
key, func in dpu_func_dict.items(): + temp_data, temp_thresh, temp_crit_thresh, fault_val = self.get_dpu_component_temperature_data(dpu_temperature_data, key) + return_val = func(dpu.get_hw_mgmt_id(), temp_data, temp_thresh, temp_crit_thresh, fault_val) + if not return_val: + logger.log_error(f"Unable to update Temperature data to hw-mgmt for {key} for {dpu.get_name()}") + + def update_single_dpu(self, dpu): + try: + dpu_oper_status = dpu.get_oper_status() + pre_oper_status = self._dpu_status.get(dpu.get_name()) + if dpu_oper_status == DPU_STATUS_ONLINE: + self.update_dpu_temperature(dpu) + else: + if pre_oper_status != dpu_oper_status: + self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) + if pre_oper_status != dpu_oper_status: + self._dpu_status[dpu.get_name()] = dpu_oper_status + except Exception as e: + logger.log_error(f'Failed to update DPU {dpu.get_hw_mgmt_id()} thermal data - {e}') + self.update_dpu_temperature(dpu, fault_state=True) + + def update_dpu(self): + for dpu in self._dpu_list: + self.update_single_dpu(dpu) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index a8c7a768f8d0..4f800440c917 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -15,7 +15,8 @@ # limitations under the License. # from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase -from . import thermal_updater +from . import thermal_updater +from . 
import smartswitch_thermal_updater from .device_data import DeviceDataManager @@ -35,21 +36,20 @@ def initialize(cls): :return: """ cls.thermal_updater_req = False - sfps = [] dpus = [] - host_mgmt_mode = False - if DeviceDataManager.is_module_host_management_mode(): + dpus_present = DeviceDataManager.get_platform_dpus_data() + host_mgmt_mode = DeviceDataManager.is_module_host_management_mode() + if not dpus_present and host_mgmt_mode: + # Non smart switch behaviour has highest priority from .chassis import Chassis - sfps = Chassis.chassis_instance.get_all_sfps() - cls.thermal_updater_req = True - host_mgmt_mode = True - if DeviceDataManager.get_platform_dpus_data(): - # If DPUs are present then this if condition is reached + cls.thermal_updater_task = thermal_updater.ThermalUpdater(sfp_list=Chassis.chassis_instance.get_all_sfps()) + elif dpus_present: from .chassis import Chassis dpus = Chassis.chassis_instance.get_all_modules() - cls.thermal_updater_req = True - if cls.thermal_updater_req: - cls.thermal_updater_task = thermal_updater.ThermalUpdater(sfps, dpus, host_mgmt_mode) + cls.thermal_updater_task = smartswitch_thermal_updater.SmartswitchThermalUpdater(sfp_list=Chassis.chassis_instance.get_all_sfps(), + dpu_list=dpus, + is_host_mgmt_mode=host_mgmt_mode) + if cls.thermal_updater_task: cls.thermal_updater_task.start() @classmethod @@ -59,5 +59,5 @@ def deinitialize(cls): is a no-op. 
:return: """ - if cls.thermal_updater_req and cls.thermal_updater_task: + if cls.thermal_updater_task: cls.thermal_updater_task.stop() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index 6bb2cb4d52e7..ba4f0503cb4d 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -19,6 +19,7 @@ from sonic_py_common import logger import sys +import time sys.path.append('/run/hw-management/bin') @@ -34,36 +35,11 @@ hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock() hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock() -try: - import hw_management_dpu_thermal_update -except ImportError: - # For unit test and for non-smartswitch systems, these functions should not be called - from unittest import mock - hw_management_dpu_thermal_update = mock.MagicMock() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set = mock.MagicMock() - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set = mock.MagicMock() - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set = mock.MagicMock() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear = mock.MagicMock() - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear = mock.MagicMock() - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear = mock.MagicMock() SFP_TEMPERATURE_SCALE = 1000 ASIC_TEMPERATURE_SCALE = 125 ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000 ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000 -CRIT_THRESH = "critical_high_threshold" -HIGH_THRESH = "high_threshold" -TEMPERATURE_DATA = "temperature" -DPU_STATUS_OFFLINE = "Offline" -DPU_STATUS_ONLINE = "Online" -CPU_FIELD = "CPU" -NVME_FIELD = "NVME" -DDR_FIELD = "DDR" -dpu_func_dict = { - CPU_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set, - 
NVME_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_drive_set, - DDR_FIELD: hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set, - } ERROR_READ_THERMAL_DATA = 254000 @@ -72,50 +48,17 @@ class ThermalUpdater: - def __init__(self, sfp_list, dpu_list=[], is_host_mgmt_mode=True): - # Default initialization is in host mgmt mode without dpus + def __init__(self, sfp_list): self._sfp_list = sfp_list self._sfp_status = {} self._timer = utils.Timer() - self._dpu_list = dpu_list - self._dpu_status = dpu_list - self.dpus_exist = False - if len(self._dpu_list) > 0: - self.dpus_exist = True - self._dpu_status = {} - self.dev_parameters = None - self.data = None - self.read_checked = False - self.configure_functions(self.dpus_exist, is_host_mgmt_mode) - - def configure_functions(self, dpu, independent_mode): - self.start = self.start_independent_mode - self.stop = self.stop_independent_mode - self.load_tc_config = self.load_tc_config_asic_sfp - self.clean_thermal_data = self.clean_thermal_data_asic_sfp - if dpu: - self.clean_thermal_data = self.clean_all - self.load_tc_config = self.load_tc_config_all - if not independent_mode: - self.start = self.start_no_independent - self.stop = self.stop_no_independent_mode - self.load_tc_config = self.load_tc_config_dpu - self.clean_thermal_data = self.clean_thermal_data_dpu - def read_tc_config_data(self): - if self.read_checked: - return self.data + def load_tc_config(self): + asic_poll_interval = 1 + sfp_poll_interval = 10 data = utils.load_json_file(TC_CONFIG_FILE, log_func=None) if not data: logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval') - self.data = data - self.read_checked = True - return self.data - - def load_tc_config_asic_sfp(self): - asic_poll_interval = 1 - sfp_poll_interval = 10 - data = self.read_tc_config_data() if data: dev_parameters = data.get('dev_parameters') @@ -136,48 +79,21 @@ def load_tc_config_asic_sfp(self): logger.log_notice(f'Module polling interval: 
{sfp_poll_interval}') self._timer.schedule(sfp_poll_interval, self.update_module) - def load_tc_config_dpu(self): - dpu_poll_interval = 3 - data = self.read_tc_config_data() - if data: - dev_parameters = data.get('dev_parameters', {}) - dpu_parameter = dev_parameters.get('dpu\\d+_module', {}) - dpu_poll_interval_config = dpu_parameter.get('poll_time') - dpu_poll_interval = int(dpu_poll_interval_config) / 2 if dpu_poll_interval_config else dpu_poll_interval - logger.log_notice(f'DPU polling interval: {dpu_poll_interval}') - self._timer.schedule(dpu_poll_interval, self.update_dpu) - - def load_tc_config_all(self): - self.load_tc_config_asic_sfp() - self.load_tc_config_dpu() - - def start_independent_mode(self): + def start(self): self.clean_thermal_data() self.control_tc(False) self.load_tc_config() self._timer.start() - def start_no_independent(self): - self.clean_thermal_data() - self.load_tc_config() - self._timer.start() - - def stop_independent_mode(self): + def stop(self): self._timer.stop() self.control_tc(True) - def stop_no_independent_mode(self): - self._timer.stop() - def control_tc(self, suspend): logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}') utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0) - def clean_all(self): - self.clean_thermal_data_asic_sfp() - self.clean_thermal_data_dpu() - - def clean_thermal_data_asic_sfp(self): + def clean_thermal_data(self): hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list)) hw_management_independent_mode_update.thermal_data_clean_asic(0) for sfp in self._sfp_list: @@ -186,15 +102,6 @@ def clean_thermal_data_asic_sfp(self): sfp.sdk_index + 1 ) - def clean_thermal_data_dpu(self): - for dpu in self._dpu_list: - self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) - - def thermal_data_dpu_clear(self, dpu_index): - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear(dpu_index) - 
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear(dpu_index) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear(dpu_index) - def get_asic_temp(self): temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None @@ -250,60 +157,10 @@ def update_single_module(self, sfp): ERROR_READ_THERMAL_DATA ) - def get_dpu_temperature_data_from_dict_obj(self, dpu_component_temperature_data, field_name): - value = dpu_component_temperature_data.get(field_name) - fault_state = False - if not value: - fault_state = True - return 0, fault_state - try: - int_value = int(float(value)) - except ValueError: - logger.log_error(f"Unable to obtain temperature data for DPU {field_name}: {value}") - int_value = 0 - fault_state = True - return int_value, fault_state - - def get_dpu_component_temperature_data(self, dpu_temperature_data, component_name): - dpu_component_temperature_data = dpu_temperature_data.get(component_name, {}) - output_dict = {} - output_false_state = False - for value in [TEMPERATURE_DATA, HIGH_THRESH, CRIT_THRESH]: - output_dict[value], fault_state = self.get_dpu_temperature_data_from_dict_obj(dpu_component_temperature_data, value) - output_false_state = output_false_state or fault_state - return output_dict[TEMPERATURE_DATA], output_dict[HIGH_THRESH], output_dict[CRIT_THRESH], ERROR_READ_THERMAL_DATA if output_false_state else 0 - - def update_dpu_temperature(self, dpu, fault_state=False): - dpu_temperature_data = dpu.get_temperature_dict() if not fault_state else {} - for key, func in dpu_func_dict.items(): - temp_data, temp_thresh, temp_crit_thresh, fault_val = self.get_dpu_component_temperature_data(dpu_temperature_data, key) - return_val = func(dpu.get_hw_mgmt_id(), temp_data, temp_thresh, temp_crit_thresh, fault_val) - if not return_val: - logger.log_error(f"Unable to update Temperature data to hw-mgmt for {key} for 
{dpu.get_name()}") - - def update_single_dpu(self, dpu): - try: - dpu_oper_status = dpu.get_oper_status() - pre_oper_status = self._dpu_status.get(dpu.get_name()) - if dpu_oper_status == DPU_STATUS_ONLINE: - self.update_dpu_temperature(dpu) - else: - if pre_oper_status != dpu_oper_status: - self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) - if pre_oper_status != dpu_oper_status: - self._dpu_status[dpu.get_name()] = dpu_oper_status - except Exception as e: - logger.log_error(f'Failed to update DPU {dpu.get_hw_mgmt_id()} thermal data - {e}') - self.update_dpu_temperature(dpu, fault_state=True) - def update_module(self): for sfp in self._sfp_list: self.update_single_module(sfp) - def update_dpu(self): - for dpu in self._dpu_list: - self.update_single_dpu(dpu) - def update_asic(self): try: asic_temp = self.get_asic_temp() @@ -330,4 +187,4 @@ def update_asic(self): 0, 0, ERROR_READ_THERMAL_DATA - ) + ) \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py new file mode 100644 index 000000000000..6aeb4ba5859c --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py @@ -0,0 +1,221 @@ +# +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from unittest import mock +import copy + +from sonic_platform import utils +from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update +from sonic_platform.smartswitch_thermal_updater import SmartswitchThermalUpdater, hw_management_dpu_thermal_update +from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \ + ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD, ERROR_READ_THERMAL_DATA + + +mock_tc_config = """ +{ + "dev_parameters": { + "asic": { + "pwm_min": 20, + "pwm_max": 100, + "val_min": "!70000", + "val_max": "!105000", + "poll_time": 3 + }, + "module\\\\d+": { + "pwm_min": 20, + "pwm_max": 100, + "val_min": 60000, + "val_max": 80000, + "poll_time": 20 + }, + "dpu\\\\d+_module": { + "child_sensors_list": ["cx_amb", "voltmon1", "voltmon2"], + "poll_time": 24 + } + } +} +""" + + +class TestSmartSwitchThermalUpdater: + @mock.patch('sonic_platform.utils.write_file') + def test_configuration(self, mock_write): + dpu = mock.MagicMock() + mock_sfp = mock.MagicMock() + mock_sfp.sdk_index = 1 + self.reset_hw_mgmt_mocks() + mock_os_open = mock.mock_open(read_data=mock_tc_config) + updater = SmartswitchThermalUpdater([mock_sfp], dpu_list=[dpu]) + """ Expectation on start - Clean is called for sfp, asic, DPU + suspend -> 1 and load config for all 3 along with start of timer""" + updater._timer = mock.MagicMock() + mock_os_open = mock.mock_open(read_data=mock_tc_config) + with mock.patch('sonic_platform.utils.open', mock_os_open): + updater.start() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_independent_mode_update.thermal_data_clean_asic.assert_called_once() + 
hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0) + assert updater._timer.schedule.call_count == 3 + # Called for DPU with time 24/2 = 12 + assert updater._timer.schedule.call_args_list[-1][0][0] == 12 + # Expectation on stop - timer stop and suspend = 1 + mock_write.reset_mock() + updater.stop() + updater._timer.stop.assert_called_once() + mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) + mock_write.reset_mock() + self.reset_hw_mgmt_mocks() + updater = SmartswitchThermalUpdater(None, dpu_list=[dpu], is_host_mgmt_mode=False) + """ Expectation on start - Clean is called for DPU + load config for DPU along with start of timer""" + updater._timer = mock.MagicMock() + updater.start() + mock_write.assert_not_called() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) + hw_management_independent_mode_update.thermal_data_clean_asic.assert_not_called() + hw_management_independent_mode_update.thermal_data_clean_module.assert_not_called() + # Expectation on stop - timer stop + updater.stop() + updater._timer.stop.assert_called_once() + mock_write.assert_not_called() + + def test_update_dpu(self): + self.reset_hw_mgmt_mocks() + mock_dpu = mock.MagicMock() + mock_dpu.get_hw_mgmt_id = mock.MagicMock(return_value=1) + mock_dpu.get_name = mock.MagicMock(return_value="DPU0") + mock_dpu.get_oper_status = mock.MagicMock(return_value="Online") + temp_data = { + "DDR": {'temperature': '75.0', 'high_threshold': '95', 'critical_high_threshold': '100'}, + "CPU": {'temperature': '82.0', 'high_threshold': '90', 'critical_high_threshold': '100'}, + "NVME": {'temperature': '91', 
'high_threshold': '95', 'critical_high_threshold': '98'} + } + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data) + print(f"{mock_dpu.get_temperature_dict()}") + updater = SmartswitchThermalUpdater(sfp_list=None, dpu_list=[mock_dpu], is_host_mgmt_mode=False) + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 75, 95, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 82, 90, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 91, 95, 98, 0) + mock_dpu.get_temperature_dict = mock.MagicMock(return_value={}) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + func_dict = { + "DDR": hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set, + "CPU": hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set, + "NVME": hw_management_dpu_thermal_update.thermal_data_dpu_drive_set, + } + for value in ["DDR", "CPU", "NVME"]: + temp_data_without_entry = copy.deepcopy(temp_data) + # One of the values in DDR, CPU and NVME is set to empty + temp_data_without_entry[value] = {} + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data_without_entry) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + for key, func in func_dict.items(): + if key == value: + func.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) + else: + func.assert_called_once_with( + 1, + int(float(temp_data[key]['temperature'])), + int(float(temp_data[key]['high_threshold'])), + int(float(temp_data[key]['critical_high_threshold'])), + 0) + # One 
of the values in DDR, CPU and NVME is set to a string, can not convert to integer + for field in ["temperature", "high_threshold", "critical_high_threshold"]: + temp_data_invalid = copy.deepcopy(temp_data) + temp_data_orig = copy.deepcopy(temp_data) + temp_data_invalid[value][field] = "N/A" + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data_invalid) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + for key, func in func_dict.items(): + temp_data_orig[value][field] = 0 + func.assert_called_once_with( + 1, + int(float(temp_data_orig[key]['temperature'])), + int(float(temp_data_orig[key]['high_threshold'])), + int(float(temp_data_orig[key]['critical_high_threshold'])), + ERROR_READ_THERMAL_DATA if value == key else 0) + self.reset_hw_mgmt_mocks() + mock_dpu.get_oper_status = mock.MagicMock(return_value="Offline") + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_not_called() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_not_called() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_not_called() + # Clear is called only once + updater.update_dpu() + updater.update_dpu() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(1) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(1) + self.reset_hw_mgmt_mocks() + mock_dpu.get_oper_status = mock.MagicMock(return_value="Online") + mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data) + updater.update_dpu() + 
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 75, 95, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 82, 90, 100, 0) + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 91, 95, 98, 0) + # Multiple dpus + mock_dpu1 = mock.MagicMock() + mock_dpu1.get_hw_mgmt_id = mock.MagicMock(return_value=2) + mock_dpu1.get_name = mock.MagicMock(return_value="DPU1") + mock_dpu1.get_oper_status = mock.MagicMock(return_value="Online") + temp_data_1 = copy.deepcopy(temp_data) + temp_data_1["DDR"]["temperature"] = "52.0" + temp_data_1["CPU"]["temperature"] = "20.0" + temp_data_1["NVME"]["temperature"] = "100.0" + mock_dpu1.get_temperature_dict = mock.MagicMock(return_value=temp_data_1) + updater = SmartswitchThermalUpdater(sfp_list=None, dpu_list=[mock_dpu, mock_dpu1], is_host_mgmt_mode=False) + self.reset_hw_mgmt_mocks() + updater.update_dpu() + assert hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.call_count == 2 + assert hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.call_count == 2 + assert hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.call_count == 2 + assert hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.call_args_list \ + == [mock.call(1, 75, 95, 100, 0), mock.call(2, 52, 95, 100, 0)] + assert hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.call_args_list \ + == [mock.call(1, 82, 90, 100, 0), mock.call(2, 20, 90, 100, 0)] + assert hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.call_args_list \ + == [mock.call(1, 91, 95, 98, 0), mock.call(2, 100, 95, 98, 0)] + + def reset_hw_mgmt_mocks(self): + hw_management_independent_mode_update.reset_mock() + hw_management_independent_mode_update.thermal_data_clean_module.reset_mock() + hw_management_independent_mode_update.thermal_data_clean_asic.reset_mock() + 
hw_management_independent_mode_update.module_data_set_module_counter.reset_mock() + hw_management_independent_mode_update.thermal_data_set_asic.reset_mock() + hw_management_independent_mode_update.thermal_data_set_module.reset_mock() + hw_management_dpu_thermal_update.reset_mock() + hw_management_dpu_thermal_update.thermal_data_clean_module.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.reset_mock() + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.reset_mock() \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py index a3a2994fc25b..2f39b1cd1a8b 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_manager.py @@ -24,7 +24,7 @@ class TestThermalManager: @mock.patch('sonic_platform.chassis.Chassis.chassis_instance', new_callable=mock.MagicMock) @mock.patch('sonic_platform.device_data.DeviceDataManager.is_module_host_management_mode') - @mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_json_data') + @mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_dpus_data') def test_updater_init(self, mock_dpus_data, mock_management_mode, mock_chassis_instance): mock_dpus_data.return_value = {} mock_management_mode.return_value = True @@ -35,11 +35,12 @@ def test_updater_init(self, mock_dpus_data, mock_management_mode, mock_chassis_i sfp_mock.return_value = ['sfp1', 'sfp2'] mod_mock.return_value = ['dpu1', 'dpu2'] - with mock.patch('sonic_platform.thermal_updater.ThermalUpdater') as mock_thermal: + with mock.patch('sonic_platform.thermal_updater.ThermalUpdater') as 
mock_thermal, \ + mock.patch('sonic_platform.smartswitch_thermal_updater.SmartswitchThermalUpdater') as mock_sm_thermal: # Host mgmt mode, no DPUs are used for init mgr = ThermalManager() mgr.initialize() - mock_thermal.assert_called_once_with(['sfp1', 'sfp2'], [], True) + mock_thermal.assert_called_once_with(sfp_list=['sfp1', 'sfp2']) mgr.deinitialize() mgr.thermal_updater_task.stop.assert_called_once() # Not initialized if no DPUs and not in host mgmt mode @@ -48,18 +49,19 @@ def test_updater_init(self, mock_dpus_data, mock_management_mode, mock_chassis_i mgr.initialize() mock_thermal.assert_not_called() mgr.deinitialize() - mgr.thermal_updater_task.stop.assert_not_called() + mgr.thermal_updater_task.stop.assert_called_once() # Initialized with DPUs if DPUs are present mock_dpus_data.return_value = {'DPUS': 'dpu1'} mock_thermal.reset_mock() mgr.initialize() - mock_thermal.assert_called_once_with([], ['dpu1', 'dpu2'], False) + mock_sm_thermal.assert_called_once_with(sfp_list=['sfp1', 'sfp2'], dpu_list=['dpu1', 'dpu2'], is_host_mgmt_mode=False) mgr.deinitialize() mgr.thermal_updater_task.stop.assert_called_once() # Host mgmt mode, with DPUS mock_thermal.reset_mock() + mock_sm_thermal.reset_mock() mock_management_mode.return_value = True mgr.initialize() - mock_thermal.assert_called_once_with(['sfp1', 'sfp2'], ['dpu1', 'dpu2'], True) + mock_sm_thermal.assert_called_once_with(sfp_list=['sfp1', 'sfp2'], dpu_list=['dpu1', 'dpu2'], is_host_mgmt_mode=True) mgr.deinitialize() mgr.thermal_updater_task.stop.assert_called_once() diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py index f26bef353a5c..c05e8e54b67d 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -15,13 +15,13 @@ # limitations under the License. 
# +import time from unittest import mock -import copy from sonic_platform import utils -from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update, hw_management_dpu_thermal_update +from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \ - ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD, ERROR_READ_THERMAL_DATA + ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD mock_tc_config = """ @@ -40,10 +40,6 @@ "val_min": 60000, "val_max": 80000, "poll_time": 20 - }, - "dpu\\\\d+_module": { - "child_sensors_list": ["cx_amb", "voltmon1", "voltmon2"], - "poll_time": 24 } } } @@ -112,191 +108,4 @@ def test_update_module(self): mock_sfp.get_presence = mock.MagicMock(return_value=False) updater.update_module() - hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) - - # Smartswitch specific tests - def test_load_tc_config_non_exists_dpu(self): - dpu = mock.MagicMock() - updater = ThermalUpdater(None, dpu_list=[dpu]) - updater.load_tc_config() - # 3 events - ASIC, DPU, sfp - assert updater._timer._timestamp_queue.qsize() == 3 - updater = ThermalUpdater(None, dpu_list=[dpu], is_host_mgmt_mode=False) - updater.load_tc_config() - # 1 event - DPU - assert updater._timer._timestamp_queue.qsize() == 1 - - def test_load_tc_config_mocked_dpu(self): - dpu = mock.MagicMock() - updater = ThermalUpdater(None, dpu_list=[dpu]) - mock_os_open = mock.mock_open(read_data=mock_tc_config) - with mock.patch('sonic_platform.utils.open', mock_os_open): - updater.load_tc_config() - assert updater._timer._timestamp_queue.qsize() == 3 - - @mock.patch('sonic_platform.utils.write_file') - def test_configuration(self, mock_write): - dpu = mock.MagicMock() - mock_sfp = mock.MagicMock() - mock_sfp.sdk_index = 1 - self.reset_hw_mgmt_mocks() - mock_os_open = mock.mock_open(read_data=mock_tc_config) - updater = 
ThermalUpdater([mock_sfp], dpu_list=[dpu]) - """ Expectation on start - Clean is called for sfp, asic, DPU - suspend -> 1 and load config for all 3 along with start of timer""" - updater._timer = mock.MagicMock() - mock_os_open = mock.mock_open(read_data=mock_tc_config) - with mock.patch('sonic_platform.utils.open', mock_os_open): - updater.start() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - hw_management_independent_mode_update.thermal_data_clean_asic.assert_called_once() - hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once() - mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0) - assert updater._timer.schedule.call_count == 3 - # Called for DPU with time 24/2 = 12 - assert updater._timer.schedule.call_args_list[-1][0][0] == 12 - # Expectation on stop - timer stop and suspend = 1 - mock_write.reset_mock() - updater.stop() - updater._timer.stop.assert_called_once() - mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) - mock_write.reset_mock() - self.reset_hw_mgmt_mocks() - updater = ThermalUpdater(None, dpu_list=[dpu], is_host_mgmt_mode=False) - """ Expectation on start - Clean is called for DPU - load config for DPU along with start of timer""" - updater._timer = mock.MagicMock() - updater.start() - mock_write.assert_not_called() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - 
hw_management_independent_mode_update.thermal_data_clean_asic.assert_not_called() - hw_management_independent_mode_update.thermal_data_clean_module.assert_not_called() - # Expectation on stop - timer stop - updater.stop() - updater._timer.stop.assert_called_once() - mock_write.assert_not_called() - - def test_update_dpu(self): - self.reset_hw_mgmt_mocks() - mock_dpu = mock.MagicMock() - mock_dpu.get_hw_mgmt_id = mock.MagicMock(return_value=1) - mock_dpu.get_name = mock.MagicMock(return_value="DPU0") - mock_dpu.get_oper_status = mock.MagicMock(return_value="Online") - temp_data = { - "DDR": {'temperature': '75.0', 'high_threshold': '95', 'critical_high_threshold': '100'}, - "CPU": {'temperature': '82.0', 'high_threshold': '90', 'critical_high_threshold': '100'}, - "NVME": {'temperature': '91', 'high_threshold': '95', 'critical_high_threshold': '98'} - } - mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data) - updater = ThermalUpdater(sfp_list=None, dpu_list=[mock_dpu], is_host_mgmt_mode=False) - updater.update_dpu() - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 75, 95, 100, 0) - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 82, 90, 100, 0) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 91, 95, 98, 0) - mock_dpu.get_temperature_dict = mock.MagicMock(return_value={}) - self.reset_hw_mgmt_mocks() - updater.update_dpu() - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) - func_dict = { - "DDR": hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set, - "CPU": 
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set, - "NVME": hw_management_dpu_thermal_update.thermal_data_dpu_drive_set, - } - for value in ["DDR", "CPU", "NVME"]: - temp_data_without_entry = copy.deepcopy(temp_data) - # One of the values in DDR, CPU and NVME is set to empty - temp_data_without_entry[value] = {} - mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data_without_entry) - self.reset_hw_mgmt_mocks() - updater.update_dpu() - for key, func in func_dict.items(): - if key == value: - func.assert_called_once_with(1, 0, 0, 0, ERROR_READ_THERMAL_DATA) - else: - func.assert_called_once_with( - 1, - int(float(temp_data[key]['temperature'])), - int(float(temp_data[key]['high_threshold'])), - int(float(temp_data[key]['critical_high_threshold'])), - 0) - # One of the values in DDR, CPU and NVME is set to a string, can not convert to integer - for field in ["temperature", "high_threshold", "critical_high_threshold"]: - temp_data_invalid = copy.deepcopy(temp_data) - temp_data_orig = copy.deepcopy(temp_data) - temp_data_invalid[value][field] = "N/A" - mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data_invalid) - self.reset_hw_mgmt_mocks() - updater.update_dpu() - for key, func in func_dict.items(): - temp_data_orig[value][field] = 0 - func.assert_called_once_with( - 1, - int(float(temp_data_orig[key]['temperature'])), - int(float(temp_data_orig[key]['high_threshold'])), - int(float(temp_data_orig[key]['critical_high_threshold'])), - ERROR_READ_THERMAL_DATA if value == key else 0) - self.reset_hw_mgmt_mocks() - mock_dpu.get_oper_status = mock.MagicMock(return_value="Offline") - updater.update_dpu() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(1) - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(1) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(1) - 
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_not_called() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_not_called() - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_not_called() - # Clear is called only once - updater.update_dpu() - updater.update_dpu() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(1) - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(1) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(1) - self.reset_hw_mgmt_mocks() - mock_dpu.get_oper_status = mock.MagicMock(return_value="Online") - mock_dpu.get_temperature_dict = mock.MagicMock(return_value=temp_data) - updater.update_dpu() - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.assert_called_once_with(1, 75, 95, 100, 0) - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.assert_called_once_with(1, 82, 90, 100, 0) - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.assert_called_once_with(1, 91, 95, 98, 0) - # Multiple dpus - mock_dpu1 = mock.MagicMock() - mock_dpu1.get_hw_mgmt_id = mock.MagicMock(return_value=2) - mock_dpu1.get_name = mock.MagicMock(return_value="DPU1") - mock_dpu1.get_oper_status = mock.MagicMock(return_value="Online") - temp_data_1 = copy.deepcopy(temp_data) - temp_data_1["DDR"]["temperature"] = "52.0" - temp_data_1["CPU"]["temperature"] = "20.0" - temp_data_1["NVME"]["temperature"] = "100.0" - mock_dpu1.get_temperature_dict = mock.MagicMock(return_value=temp_data_1) - updater = ThermalUpdater(sfp_list=None, dpu_list=[mock_dpu, mock_dpu1], is_host_mgmt_mode=False) - self.reset_hw_mgmt_mocks() - updater.update_dpu() - assert hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.call_count == 2 - assert hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.call_count == 2 - assert hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.call_count == 2 
- assert hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.call_args_list \ - == [mock.call(1, 75, 95, 100, 0), mock.call(2, 52, 95, 100, 0)] - assert hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.call_args_list \ - == [mock.call(1, 82, 90, 100, 0), mock.call(2, 20, 90, 100, 0)] - assert hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.call_args_list \ - == [mock.call(1, 91, 95, 98, 0), mock.call(2, 100, 95, 98, 0)] - - def reset_hw_mgmt_mocks(self): - hw_management_independent_mode_update.reset_mock() - hw_management_independent_mode_update.thermal_data_clean_module.reset_mock() - hw_management_independent_mode_update.thermal_data_clean_asic.reset_mock() - hw_management_independent_mode_update.module_data_set_module_counter.reset_mock() - hw_management_independent_mode_update.thermal_data_set_asic.reset_mock() - hw_management_independent_mode_update.thermal_data_set_module.reset_mock() - hw_management_dpu_thermal_update.reset_mock() - hw_management_dpu_thermal_update.thermal_data_clean_module.reset_mock() - hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.reset_mock() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.reset_mock() - hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.reset_mock() - hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.reset_mock() - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.reset_mock() + hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) \ No newline at end of file From f6f0d656c2c763601bef946116e1475688195c74 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 20:15:43 +0000 Subject: [PATCH 04/10] Fix difference --- .../mlnx-platform-api/sonic_platform/thermal_manager.py | 2 -- .../mlnx-platform-api/sonic_platform/thermal_updater.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py 
b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 4f800440c917..622404b1778a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -22,7 +22,6 @@ class ThermalManager(ThermalManagerBase): thermal_updater_task = None - thermal_updaer_req = None @classmethod def run_policy(cls, chassis): @@ -35,7 +34,6 @@ def initialize(cls): and any other vendor specific initialization. :return: """ - cls.thermal_updater_req = False dpus = [] dpus_present = DeviceDataManager.get_platform_dpus_data() host_mgmt_mode = DeviceDataManager.is_module_host_management_mode() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index ba4f0503cb4d..9a7af1b06852 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -187,4 +187,5 @@ def update_asic(self): 0, 0, ERROR_READ_THERMAL_DATA - ) \ No newline at end of file + ) + From 6fce0b5e311e6e2dcfbf617cdfc5c03281e1de6b Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 20:19:55 +0000 Subject: [PATCH 05/10] Fix headers --- .../sonic_platform/smartswitch_thermal_updater.py | 3 ++- .../tests/test_smartswsitch_thermal_updater.py | 5 +++-- .../mellanox/mlnx-platform-api/tests/test_thermal_updater.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py index ffc137f57c9b..4112877386a1 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py @@ -1,5 +1,6 @@ # -# Copyright (c) 2023-2024 NVIDIA 
CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py index 6aeb4ba5859c..18e5f24cc15a 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py @@ -1,5 +1,6 @@ # -# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -218,4 +219,4 @@ def reset_hw_mgmt_mocks(self): hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.reset_mock() hw_management_dpu_thermal_update.thermal_data_dpu_ddr_set.reset_mock() hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_set.reset_mock() - hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.reset_mock() \ No newline at end of file + hw_management_dpu_thermal_update.thermal_data_dpu_drive_set.reset_mock() diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py index c05e8e54b67d..61650d61606b 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -108,4 +108,5 @@ def test_update_module(self): mock_sfp.get_presence = mock.MagicMock(return_value=False) updater.update_module() - hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) \ No newline at end of file + 
hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) + From d665cff9b52d297f4807eefec0d6b603928bf87b Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 20:22:36 +0000 Subject: [PATCH 06/10] Restore files --- .../mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py | 1 - .../mellanox/mlnx-platform-api/tests/test_thermal_updater.py | 1 - 2 files changed, 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index 9a7af1b06852..889bc96d3bec 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -188,4 +188,3 @@ def update_asic(self): 0, ERROR_READ_THERMAL_DATA ) - diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py index 61650d61606b..c135395c363b 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -109,4 +109,3 @@ def test_update_module(self): mock_sfp.get_presence = mock.MagicMock(return_value=False) updater.update_module() hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, 11) - From 2e5f9b0cd15455346a3e5bc64eb182e6a1556398 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 21:08:51 +0000 Subject: [PATCH 07/10] Updated configuration implementation --- .../smartswitch_thermal_updater.py | 55 ++++--------------- 1 file changed, 12 insertions(+), 43 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py index 4112877386a1..16ca435d3e49 100644 ---
a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py @@ -37,10 +37,6 @@ hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear = mock.MagicMock() hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear = mock.MagicMock() -SFP_TEMPERATURE_SCALE = 1000 -ASIC_TEMPERATURE_SCALE = 125 -ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000 -ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000 CRIT_THRESH = "critical_high_threshold" HIGH_THRESH = "high_threshold" TEMPERATURE_DATA = "temperature" @@ -64,24 +60,9 @@ class SmartswitchThermalUpdater(ThermalUpdater): def __init__(self, sfp_list, dpu_list=[], is_host_mgmt_mode=True): super().__init__(sfp_list=sfp_list) - self._sfp_list = sfp_list - self._sfp_status = {} - # Use single timer attribute - self._timer = utils.Timer() self._dpu_list = dpu_list - self.configure_functions(is_host_mgmt_mode) self._dpu_status = {} - - def configure_functions(self, independent_mode): - self.start = self.start_no_independent_mode - self.stop = self.stop_no_independent_mode - self.load_tc_config = self.load_tc_config_dpu - self.clean_thermal_data = self.clean_thermal_data_dpu - if independent_mode: - self.clean_thermal_data = self.clean_all - self.load_tc_config = self.load_tc_config_all - self.start = self.start_independent_mode - self.stop = self.stop_independent_mode + self.host_mgmt_mode = is_host_mgmt_mode def load_tc_config_dpu(self): dpu_poll_interval = 3 @@ -96,31 +77,19 @@ def load_tc_config_dpu(self): logger.log_notice(f'DPU polling interval: {dpu_poll_interval}') self._timer.schedule(dpu_poll_interval, self.update_dpu) - def load_tc_config_all(self): - super().load_tc_config() + def start(self): + self.thermal_data_dpu_clear() self.load_tc_config_dpu() + if self.host_mgmt_mode: + super().start() + else: + self._timer.start() - def start_independent_mode(self): - self.clean_thermal_data() - super().control_tc(False) 
- self.load_tc_config() - self._timer.start() - - def start_no_independent_mode(self): - self.clean_thermal_data() - self.load_tc_config() - self._timer.start() - - def stop_independent_mode(self): - self._timer.stop() - super().control_tc(True) - - def stop_no_independent_mode(self): - self._timer.stop() - - def clean_all(self): - super().clean_thermal_data() - self.clean_thermal_data_dpu() + def stop(self): + if self.host_mgmt_mode: + super().stop() + else: + self._timer.stop() def clean_thermal_data_dpu(self): for dpu in self._dpu_list: From 2641deca76b8409c1aa525f2e4cce196a20d1c12 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 21:31:58 +0000 Subject: [PATCH 08/10] Test fixes --- .../sonic_platform/smartswitch_thermal_updater.py | 2 +- .../tests/test_smartswsitch_thermal_updater.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py index 16ca435d3e49..6765016921e0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py @@ -78,7 +78,7 @@ def load_tc_config_dpu(self): self._timer.schedule(dpu_poll_interval, self.update_dpu) def start(self): - self.thermal_data_dpu_clear() + self.clean_thermal_data_dpu() self.load_tc_config_dpu() if self.host_mgmt_mode: super().start() diff --git a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py index 18e5f24cc15a..aee47dc6ddfd 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py @@ -75,7 +75,7 @@ def test_configuration(self, mock_write): 
mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0) assert updater._timer.schedule.call_count == 3 # Called for DPU with time 24/2 = 12 - assert updater._timer.schedule.call_args_list[-1][0][0] == 12 + assert updater._timer.schedule.call_args_list[0][0][0] == 12 # Expectation on stop - timer stop and suspend = 1 mock_write.reset_mock() updater.stop() From 609cbe27a131915367ae9b4d48cd78e0ca20ddf3 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Mon, 25 Nov 2024 21:33:10 +0000 Subject: [PATCH 09/10] Test fixes --- .../tests/test_smartswsitch_thermal_updater.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py index aee47dc6ddfd..ce87afa6dddb 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py @@ -19,11 +19,9 @@ from unittest import mock import copy -from sonic_platform import utils -from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update +from sonic_platform.thermal_updater import hw_management_independent_mode_update from sonic_platform.smartswitch_thermal_updater import SmartswitchThermalUpdater, hw_management_dpu_thermal_update -from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \ - ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD, ERROR_READ_THERMAL_DATA +from sonic_platform.thermal_updater import ERROR_READ_THERMAL_DATA mock_tc_config = """ From 3cc382b81e7747864d562cc7f33d35efeaa2b3bf Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Wed, 27 Nov 2024 17:04:19 +0000 Subject: [PATCH 10/10] Review fixes --- .../smartswitch_thermal_updater.py | 30 +++++++++---------- .../sonic_platform/thermal_manager.py | 1 - 2 files changed, 15 insertions(+), 16 deletions(-) diff --git 
a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py index 6765016921e0..da2253e77c3e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/smartswitch_thermal_updater.py @@ -58,7 +58,7 @@ class SmartswitchThermalUpdater(ThermalUpdater): - def __init__(self, sfp_list, dpu_list=[], is_host_mgmt_mode=True): + def __init__(self, sfp_list, dpu_list, is_host_mgmt_mode=True): super().__init__(sfp_list=sfp_list) self._dpu_list = dpu_list self._dpu_status = {} @@ -125,7 +125,6 @@ def get_dpu_component_temperature_data(self, dpu_temperature_data, component_nam def update_dpu_temperature(self, dpu, fault_state=False): dpu_temperature_data = dpu.get_temperature_dict() if not fault_state else {} - print(f"{dpu_temperature_data} is the data and {fault_state}") for key, func in dpu_func_dict.items(): temp_data, temp_thresh, temp_crit_thresh, fault_val = self.get_dpu_component_temperature_data(dpu_temperature_data, key) return_val = func(dpu.get_hw_mgmt_id(), temp_data, temp_thresh, temp_crit_thresh, fault_val) @@ -133,19 +132,20 @@ def update_dpu_temperature(self, dpu, fault_state=False): logger.log_error(f"Unable to update Temperature data to hw-mgmt for {key} for {dpu.get_name()}") def update_single_dpu(self, dpu): - try: - dpu_oper_status = dpu.get_oper_status() - pre_oper_status = self._dpu_status.get(dpu.get_name()) - if dpu_oper_status == DPU_STATUS_ONLINE: - self.update_dpu_temperature(dpu) - else: - if pre_oper_status != dpu_oper_status: - self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) - if pre_oper_status != dpu_oper_status: - self._dpu_status[dpu.get_name()] = dpu_oper_status - except Exception as e: - logger.log_error(f'Failed to update DPU {dpu.get_hw_mgmt_id()} thermal data - {e}') - self.update_dpu_temperature(dpu, fault_state=True) + try: + 
dpu_oper_status = dpu.get_oper_status() + pre_oper_status = self._dpu_status.get(dpu.get_name()) + if dpu_oper_status == DPU_STATUS_ONLINE: + self.update_dpu_temperature(dpu) + elif pre_oper_status != dpu_oper_status: + # If dpu is shutdown from previous execution + self.thermal_data_dpu_clear(dpu.get_hw_mgmt_id()) + if pre_oper_status != dpu_oper_status: + # If there is a change in oper_status (irrespective of type of change) + self._dpu_status[dpu.get_name()] = dpu_oper_status + except Exception as e: + logger.log_error(f'Failed to update DPU {dpu.get_hw_mgmt_id()} thermal data - {e}') + self.update_dpu_temperature(dpu, fault_state=True) def update_dpu(self): for dpu in self._dpu_list: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 622404b1778a..813c5e8eefb0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -34,7 +34,6 @@ def initialize(cls): and any other vendor specific initialization. :return: """ - dpus = [] dpus_present = DeviceDataManager.get_platform_dpus_data() host_mgmt_mode = DeviceDataManager.is_module_host_management_mode() if not dpus_present and host_mgmt_mode: