From c161ce0153a68a7577d1f52299f4bb77e51de5ba Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Tue, 29 Oct 2024 17:56:48 +0200 Subject: [PATCH 1/9] Platform changes for rshim and no wait option --- .../sonic_platform/dpuctlplat.py | 73 ++++++++++++------- .../tests/test_dpuctlplat.py | 71 ++++++++++++++++-- 2 files changed, 112 insertions(+), 32 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index 12281db1e789..0741e2aa54dd 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -77,6 +77,7 @@ class DpuCtlPlat(): def __init__(self, dpu_name): self.dpu_name = dpu_name self._name = self.get_hwmgmt_name() + self.dpu_id = int(self.dpu_name[3:]) self.rst_path = os.path.join(SYSTEM_BASE, f"{self._name}_rst") self.pwr_path = os.path.join(SYSTEM_BASE, @@ -111,6 +112,8 @@ def __init__(self, dpu_name): self.shtdn_state = None self.dpu_ready_state = None self.setup_logger() + # Use systemd dbus to execute start and stop rshim service + os.environ['DBUS_SESSION_BUS_ADDRESS'] = 'unix:path=/run/dbus/system_bus_socket' self.verbosity = False def setup_logger(self, use_print=False): @@ -134,12 +137,14 @@ def log_info(self, msg=None): def log_error(self, msg=None): self.logger_error(f"{self.dpu_name}: {msg}") - def run_cmd_output(self, cmd): + def run_cmd_output(self, cmd, raise_exception = True): try: - subprocess.check_output(cmd) + return subprocess.check_output(cmd).decode().strip() except Exception as err: - self.log_error(f"Failed to run cmd {' '.join(cmd)}") - raise err + if raise_exception: + raise err + else: + self.log_debug(f"Failed to run cmd {' '.join(cmd)}") def dpu_pre_shutdown(self): """Method to execute shutdown activities for the DPU""" @@ -149,18 +154,22 @@ def dpu_pre_shutdown(self): def dpu_post_startup(self): """Method to execute all post startup activities for the DPU""" self.dpu_pci_scan() - self.wait_for_pci() - self.dpu_rshim_service_control("start") - - def dpu_rshim_service_control(self, set_state): + if self.wait_for_pci(): + self.dpu_rshim_service_control("start") + + def dpu_rshim_service_control(self, op): """Start/Stop the RSHIM service for the current DPU""" try: - cmd = ['systemctl', set_state, dpu_map[self.get_hwmgmt_name()]['rshim'] + ".service"] - self.run_cmd_output(cmd) - self.log_debug(f"Executed rshim service command: {' '.join(cmd)}") - except Exception: - self.log_error(f"Failed to start rshim!") - + rshim_cmd = ["dbus-send", "--dest=org.freedesktop.systemd1", "--type=method_call", + "--print-reply", "--reply-timeout=2000", + "/org/freedesktop/systemd1", + f"org.freedesktop.systemd1.Manager.{op.capitalize()}Unit", + f"string:{dpu_map[self.get_hwmgmt_name()]['rshim']}.service", + "string:replace"] + self.run_cmd_output(rshim_cmd) + except Exception as e: + self.log_error(f"Failed to {op} rshim!: {e}") + @contextmanager def get_open_fd(self, path, flag): fd = os.open(path, flag) @@ -229,12 +238,15 @@ def _power_off_force(self): self.log_info(f"Force Power Off complete") return True - def _power_on_force(self, count=4): + def _power_on_force(self, count=4, no_wait=False): """Per DPU Power on with force private function""" if count < 4: self.log_error(f"Failed Force Power on! Retry {4-count}..") self.write_file(self.pwr_f_path, OperationType.SET.value) self.write_file(self.rst_path, OperationType.SET.value) + if no_wait: + self.log_info("Exiting without checking result of reboot command") + return True get_rdy_inotify = InotifyHelper(self.dpu_rdy_path) with self.time_check_context("power on force"): dpu_rdy = get_rdy_inotify.wait_watch(WAIT_FOR_DPU_READY, 1) @@ -278,7 +290,10 @@ def dpu_power_on(self, forced=False): """Per DPU Power on API""" with self.boot_prog_context(): self.log_info(f"Power on with force = {forced}") - if forced: + if self.read_boot_prog() == BootProgEnum.OS_RUN.value: + self.log_info(f"Skipping DPU power on as DPU is already powered on") + return_value = True + elif forced: return_value = self._power_on_force() else: return_value = self._power_on() @@ -290,18 +305,24 @@ def dpu_power_off(self, forced=False): with self.boot_prog_context(): self.dpu_pre_shutdown() self.log_info(f"Power off with force = {forced}") - if forced: + if self.read_boot_prog() == BootProgEnum.RST.value: + self.log_info(f"Skipping DPU power off as DPU is already powered off") + return True + elif forced: return self._power_off_force() elif self.read_boot_prog() != BootProgEnum.OS_RUN.value: self.log_info(f"Power off with force = True since since OS is not in running state on DPU") return self._power_off_force() return self._power_off() - def _reboot(self): + def _reboot(self, no_wait): """Per DPU Reboot Private function API""" if not self.dpu_go_down(): self._power_off_force() self.write_file(self.rst_path, OperationType.SET.value) + if no_wait: + self.log_info("Exiting without checking result of reboot command") + return True get_rdy_inotify = InotifyHelper(self.dpu_rdy_path) with self.time_check_context("power on"): dpu_rdy = get_rdy_inotify.wait_watch(WAIT_FOR_DPU_READY, 1) @@ -311,25 +332,27 @@ def _reboot(self): return_value = self._power_on_force() return return_value - def _reboot_force(self): + def _reboot_force(self, no_wait): """Per DPU Force Reboot Private function API""" self._power_off_force() - return_value = self._power_on_force() + return_value = self._power_on_force(no_wait=no_wait) return return_value - def dpu_reboot(self, forced=False): + def dpu_reboot(self, forced=False, no_wait=False): """Per DPU Power on API""" with self.boot_prog_context(): self.dpu_pre_shutdown() self.log_info(f"Reboot with force = {forced}") if forced: - return_value = self._reboot_force() + return_value = self._reboot_force(no_wait) elif self.read_boot_prog() != BootProgEnum.OS_RUN.value: self.log_info(f"Reboot with force = True since OS is not in running state on DPU") - return_value = self._reboot_force() + return_value = self._reboot_force(no_wait) else: - return_value = self._reboot() - self.dpu_post_startup() + return_value = self._reboot(no_wait) + # No Post startup as well for no_wait call + if not no_wait: + self.dpu_post_startup() if return_value: self.log_info("Reboot Complete") return return_value diff --git a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py index 7340a0041953..1d17e7504967 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py @@ -108,7 +108,7 @@ def mock_write_file(file_name, content_towrite): assert "0" == written_data[2]["data"] assert written_data[3]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_pwr_force") assert "0" == written_data[3]["data"] - # Test whether value of boot_progress changes power off to force_power_off + # Test whether value of boot_progress skips power off with patch.object(dpuctl_obj, 'read_boot_prog') as mock_boot_prog, \ patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ patch.object(dpuctl_obj, '_power_off_force') as mock_power_off_force, \ @@ -117,6 +117,16 @@ def mock_write_file(file_name, content_towrite): mock_boot_prog.return_value = BootProgEnum.RST.value mock_add_watch.return_value = True assert dpuctl_obj.dpu_power_off(False) + assert mock_obj.call_args_list[1].args[0] == "Skipping DPU power off as DPU is already powered off" + # Test whether value of boot_progress changes power off to force_power_off + with patch.object(dpuctl_obj, 'read_boot_prog') as mock_boot_prog, \ + patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ + patch.object(dpuctl_obj, '_power_off_force') as mock_power_off_force, \ + patch.object(dpuctl_obj, '_power_off') as mock_power_off, \ + patch.object(dpuctl_obj, 'log_info') as mock_obj: + mock_boot_prog.return_value = BootProgEnum.OS_CRASH_PROG.value + mock_add_watch.return_value = True + assert dpuctl_obj.dpu_power_off(False) assert mock_obj.call_args_list[1].args[0] == "Power off with force = True since since OS is not in running state on DPU" mock_power_off_force.assert_called_once() mock_power_off.assert_not_called() @@ -127,7 +137,6 @@ def mock_write_file(file_name, content_towrite): mock_power_off_force.assert_not_called() mock_power_off.assert_called_once() - @patch('os.path.exists', MagicMock(return_value=True)) @patch('multiprocessing.Process.start', MagicMock(return_value=True)) @patch('multiprocessing.Process.is_alive', MagicMock(return_value=False)) @@ -147,7 +156,8 @@ def mock_write_file(file_name, content_towrite): return True with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ patch.object(dpuctl_obj, 'wait_for_pci', wraps=MagicMock(return_value=None)), \ - patch.object(dpuctl_obj, 'dpu_rshim_service_control', wraps=MagicMock(return_value=None)): + patch.object(dpuctl_obj, 'dpu_rshim_service_control', wraps=MagicMock(return_value=None)), \ + patch.object(dpuctl_obj, 'read_boot_prog', wraps=MagicMock(return_value=BootProgEnum.RST.value)): assert dpuctl_obj.dpu_power_on(True) assert mock_inotify.call_args.args[0].endswith( f"{dpuctl_obj.get_hwmgmt_name()}_ready") @@ -300,7 +310,7 @@ def mock_write_file(file_name, content_towrite): patch.object(dpuctl_obj, '_reboot_force') as mock_reset_force, \ patch.object(dpuctl_obj, 'dpu_rshim_service_control', wraps=MagicMock(return_value=None)), \ patch.object(dpuctl_obj, 'log_info') as mock_obj: - mock_boot_prog.return_value = BootProgEnum.RST.value + mock_boot_prog.return_vale = BootProgEnum.RST.value mock_add_watch.return_value = True assert dpuctl_obj.dpu_reboot(False) assert mock_obj.call_args_list[1].args[0] == "Reboot with force = True since OS is not in running state on DPU" @@ -312,6 +322,49 @@ def mock_write_file(file_name, content_towrite): assert dpuctl_obj.dpu_reboot(False) mock_reset_force.assert_not_called() mock_reset.assert_called_once() + written_data = [] + mock_inotify.reset_mock() + mock_add_watch.reset_mock() + mock_inotify.return_value = None + mock_add_watch.return_value = True + with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ + patch.object(dpuctl_obj, 'read_boot_prog', MagicMock(return_value=BootProgEnum.OS_RUN.value)), \ + patch.object(dpuctl_obj, 'dpu_rshim_service_control') as mock_rshim: + assert dpuctl_obj.dpu_reboot(forced=False, no_wait=True) + # Rshim service is only stopped and not started + mock_rshim.assert_called_once() + mock_rshim.call_args.args[0] == "stop" + assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert "1" == written_data[0]["data"] + assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") + assert "0" == written_data[1]["data"] + assert written_data[2]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") + assert "1" == written_data[2]["data"] + mock_inotify.called_once() + mock_add_watch.called_once() + written_data = [] + mock_inotify.reset_mock() + mock_add_watch.reset_mock() + mock_inotify.return_value = None + mock_add_watch.return_value = True + with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ + patch.object(dpuctl_obj, 'read_boot_prog', MagicMock(return_value=BootProgEnum.OS_START.value)), \ + patch.object(dpuctl_obj, 'dpu_rshim_service_control') as mock_rshim: + assert dpuctl_obj.dpu_reboot(forced=False, no_wait=True) + mock_rshim.assert_called_once() + mock_rshim.call_args.args[0] == "stop" + assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert "1" == written_data[0]["data"] + assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") + assert "0" == written_data[1]["data"] + assert written_data[2]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_pwr_force") + assert "0" == written_data[2]["data"] + assert written_data[3]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_pwr_force") + assert "1" == written_data[3]["data"] + assert written_data[4]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") + assert "1" == written_data[4]["data"] + mock_inotify.called_once() + mock_add_watch.called_once() def test_prog_update(self): dpuctl_obj = obj["dpuctl_list"][0] @@ -430,16 +483,20 @@ def test_rshim_service(self): dpuctl_obj.dpu_rshim_service_control('start') mock_method.assert_called_once() cmd_string = ' '.join(mock_method.call_args.args[0]) - cmd_string == f"systemctl start {dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('rshim')}.service" + service_name = dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('rshim') + operation = "Start" + assert (operation in cmd_string) and (service_name in cmd_string) mock_method.reset_mock() + operation = "Stop" dpuctl_obj.dpu_rshim_service_control('stop') - cmd_string == f"systemctl stop {dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('rshim')}.service" + cmd_string = ' '.join(mock_method.call_args.args[0]) + assert (operation in cmd_string) and (service_name in cmd_string) mock_method.assert_called_once() with pytest.raises(TypeError): dpuctl_obj.dpu_rshim_service_control() with patch.object(dpuctl_obj, 'get_hwmgmt_name', return_value="dpu5"), patch.object(dpuctl_obj, 'log_error') as mock_obj: dpuctl_obj.dpu_rshim_service_control('start') - mock_obj.assert_called_once_with("Failed to start rshim!") + mock_obj.assert_called_once_with("Failed to start rshim!: 'dpu5'") def test_pre_and_post(self): dpuctl_obj = obj["dpuctl_list"][0] From 0ce2f5e6029faf2396f33462c494bc2e31e092bc Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Thu, 31 Oct 2024 18:49:08 +0200 Subject: [PATCH 2/9] Platform JSON parse for pcie and rshim info --- .../sonic_platform/device_data.py | 32 ++++++-- .../sonic_platform/dpuctlplat.py | 61 ++++++++------- .../tests/test_device_data.py | 78 ++++++++++++++++++- .../tests/test_dpuctlplat.py | 76 ++++++++++++------ 4 files changed, 189 insertions(+), 58 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index e0592fd58feb..d443495880a8 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -19,12 +19,22 @@ import os import time import re +from enum import Enum from . import utils from sonic_py_common.general import check_output_pipe DEFAULT_WD_PERIOD = 65535 + +class DpuInterfaceEnum(Enum): + MIDPLANE_INT = "midplane_interface" + RSHIM_INT = "rshim_info" + PCIE_INT = "bus_info" + + +dpu_interface_values = [item.value for item in DpuInterfaceEnum] + DEVICE_DATA = { 'x86_64-mlnx_msn2700-r0': { 'thermal': { @@ -271,16 +281,26 @@ def get_linecard_max_port_count(cls): @classmethod @utils.read_only_cache() def get_platform_dpus_data(cls): - json_data = cls.get_platform_json_data() + from sonic_py_common import device_info + platform_path = device_info.get_path_to_platform_dir() + platform_json_path = os.path.join(platform_path, 'platform.json') + json_data = utils.load_json_file(platform_json_path) return json_data.get('DPUS', None) + @classmethod + def get_dpu_interface(cls, dpu, interface): + dpu_data = cls.get_platform_dpus_data() + if (not dpu_data) or (interface not in dpu_interface_values): + return None + return dpu_data.get(dpu, {}).get(interface) + @classmethod @utils.read_only_cache() - def get_platform_json_data(cls): - from sonic_py_common import device_info - platform_path = device_info.get_path_to_platform_dir() - platform_json_path = os.path.join(platform_path, 'platform.json') - return utils.load_json_file(platform_json_path) + def get_dpu_count(cls): + dpu_data = cls.get_platform_dpus_data() + if not dpu_data: + return 0 + return len(dpu_data) @classmethod def get_bios_component(cls): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index 0741e2aa54dd..603ac55ddf2a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -28,6 +28,7 @@ from .inotify_helper import InotifyHelper from sonic_py_common.syslogger import SysLogger from . import utils + from .device_data import DeviceDataManager, DpuInterfaceEnum except ImportError as e: raise ImportError(str(e)) from e @@ -48,6 +49,7 @@ class OperationType(Enum): CLR = "0" SET = "1" + class BootProgEnum(Enum): RST = 0 BL2 = 1 @@ -63,14 +65,6 @@ class BootProgEnum(Enum): FW_FAULT_DONE = 11 SW_INACTIVE = 15 -# The rshim services are in a different order as compared to the DPU names -dpu_map = { - "dpu1": {"pci_id": "0000:08:00.0", "rshim": "rshim@0"}, - "dpu2": {"pci_id": "0000:07:00.0", "rshim": "rshim@1"}, - "dpu3": {"pci_id": "0000:01:00.0", "rshim": "rshim@2"}, - "dpu4": {"pci_id": "0000:02:00.0", "rshim": "rshim@3"}, -} - class DpuCtlPlat(): """Class for Per DPU API Call""" @@ -90,9 +84,6 @@ def __init__(self, dpu_name): f"{self._name}_shtdn_ready") self.boot_prog_path = os.path.join(HW_BASE, f"{self._name}/system/boot_progress") - self.pci_dev_path = os.path.join(PCI_DEV_BASE, - dpu_map[self._name]["pci_id"], - "remove") self.boot_prog_map = { BootProgEnum.RST.value: "Reset/Boot-ROM", BootProgEnum.BL2.value: "BL2 (from ATF image on eMMC partition)", @@ -128,8 +119,7 @@ def setup_logger(self, use_print=False): def log_debug(self, msg=None): # Print only in verbose mode - if self.verbosity: - self.logger_debug(f"{self.dpu_name}: {msg}") + self.logger_debug(f"{self.dpu_name}: {msg}") def log_info(self, msg=None): self.logger_info(f"{self.dpu_name}: {msg}") @@ -137,7 +127,7 @@ def log_info(self, msg=None): def log_error(self, msg=None): self.logger_error(f"{self.dpu_name}: {msg}") - def run_cmd_output(self, cmd, raise_exception = True): + def run_cmd_output(self, cmd, raise_exception=True): try: return subprocess.check_output(cmd).decode().strip() except Exception as err: @@ -156,20 +146,26 @@ def dpu_post_startup(self): self.dpu_pci_scan() if self.wait_for_pci(): self.dpu_rshim_service_control("start") - + def dpu_rshim_service_control(self, op): """Start/Stop the RSHIM service for the current DPU""" try: - rshim_cmd = ["dbus-send", "--dest=org.freedesktop.systemd1", "--type=method_call", + if not self.rshim_interface: + interface_name = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.RSHIM_INT.value) + if not interface_name: + raise RuntimeError(f"Unable to Parse rshim information for {self.dpu_name} from Platform.json") + # rshim1 -> rshim@1 + self.rshim_interface = interface_name[:5] + "@" + interface_name[5:] + rshim_cmd = ["dbus-send", "--dest=org.freedesktop.systemd1", "--type=method_call", "--print-reply", "--reply-timeout=2000", "/org/freedesktop/systemd1", f"org.freedesktop.systemd1.Manager.{op.capitalize()}Unit", - f"string:{dpu_map[self.get_hwmgmt_name()]['rshim']}.service", + f"string:{self.rshim_interface}.service", "string:replace"] self.run_cmd_output(rshim_cmd) except Exception as e: self.log_error(f"Failed to {op} rshim!: {e}") - + @contextmanager def get_open_fd(self, path, flag): fd = os.open(path, flag) @@ -178,11 +174,20 @@ def get_open_fd(self, path, flag): finally: os.close(fd) + def get_pci_dev_path(self): + """Parse the PCIE device ID from platform.json, raise Runtime error if the device id is not available""" + if not self.pci_dev_path: + pci_dev_id = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.PCIE_INT.value) + if not pci_dev_id: + raise RuntimeError(f"Unable to obtain pci device id for {self.dpu_name} from platform.json") + self.pci_dev_path = os.path.join(PCI_DEV_BASE, pci_dev_id, "remove") + return self.pci_dev_path + def wait_for_pci(self): """Wait for the PCI device folder in the PCI Path, required before starting rshim""" try: with self.get_open_fd(PCI_DEV_BASE, os.O_RDONLY) as dir_fd: - if os.path.exists(os.path.dirname(self.pci_dev_path)): + if os.path.exists(os.path.dirname(self.get_pci_dev_path())): return True poll_obj = poll() poll_obj.register(dir_fd, POLLIN) @@ -190,11 +195,11 @@ def wait_for_pci(self): while (time.time() - start) < WAIT_FOR_PCI_DEV: events = poll_obj.poll(WAIT_FOR_PCI_DEV * 1000) if events: - if os.path.exists(os.path.dirname(self.pci_dev_path)): + if os.path.exists(os.path.dirname(self.get_pci_dev_path())): return True - return os.path.exists(os.path.dirname(self.pci_dev_path)) - except Exception: - self.log_error("Unable to wait for PCI device") + return os.path.exists(os.path.dirname(self.get_pci_dev_path())) + except Exception as e: + self.log_error(f"Unable to wait for PCI device:{e}") def write_file(self, file_name, content_towrite): """Write given value to file only if file exists""" @@ -245,7 +250,7 @@ def _power_on_force(self, count=4, no_wait=False): self.write_file(self.pwr_f_path, OperationType.SET.value) self.write_file(self.rst_path, OperationType.SET.value) if no_wait: - self.log_info("Exiting without checking result of reboot command") + self.log_debug("Exiting without checking result of reboot command") return True get_rdy_inotify = InotifyHelper(self.dpu_rdy_path) with self.time_check_context("power on force"): @@ -277,7 +282,7 @@ def _power_on(self): def dpu_pci_remove(self): """Per DPU PCI remove API""" try: - self.write_file(self.pci_dev_path, OperationType.SET.value) + self.write_file(self.get_pci_dev_path(), OperationType.SET.value) except Exception: self.log_info(f"Failed PCI Removal!") @@ -290,7 +295,7 @@ def dpu_power_on(self, forced=False): """Per DPU Power on API""" with self.boot_prog_context(): self.log_info(f"Power on with force = {forced}") - if self.read_boot_prog() == BootProgEnum.OS_RUN.value: + if self.read_boot_prog() == BootProgEnum.OS_RUN.value: self.log_info(f"Skipping DPU power on as DPU is already powered on") return_value = True elif forced: @@ -305,7 +310,7 @@ def dpu_power_off(self, forced=False): with self.boot_prog_context(): self.dpu_pre_shutdown() self.log_info(f"Power off with force = {forced}") - if self.read_boot_prog() == BootProgEnum.RST.value: + if self.read_boot_prog() == BootProgEnum.RST.value: self.log_info(f"Skipping DPU power off as DPU is already powered off") return True elif forced: @@ -321,7 +326,7 @@ def _reboot(self, no_wait): self._power_off_force() self.write_file(self.rst_path, OperationType.SET.value) if no_wait: - self.log_info("Exiting without checking result of reboot command") + self.log_debug("Exiting without checking result of reboot command") return True get_rdy_inotify = InotifyHelper(self.dpu_rdy_path) with self.time_check_context("power on"): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py index f67793419091..b6267d8fdc87 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,7 +28,7 @@ modules_path = os.path.dirname(test_path) sys.path.insert(0, modules_path) -from sonic_platform.device_data import DeviceDataManager +from sonic_platform.device_data import DeviceDataManager, DpuInterfaceEnum, dpu_interface_values class TestDeviceData: @@ -83,3 +83,77 @@ def test_wait_platform_ready(self, mock_is_indep, mock_exists): assert DeviceDataManager.wait_platform_ready() mock_exists.return_value = False assert not DeviceDataManager.wait_platform_ready() + + @mock.patch('sonic_py_common.device_info.get_path_to_platform_dir', mock.MagicMock(return_value='/tmp')) + @mock.patch('sonic_platform.device_data.utils.load_json_file') + def test_dpu_count(self, mock_load_json): + mock_value = { + "DPUS": { + "dpu1": { + "interface": {"Ethernet224": "Ethernet0"} + }, + "dpu2": { + "interface": {"Ethernet232": "Ethernet0"} + }, + "dpu3": { + "interface": {"EthernetX": "EthernetY"} + } + }, + } + mock_load_json.return_value = mock_value + return_dict = DeviceDataManager.get_platform_dpus_data() + dpu_data = mock_value["DPUS"] + assert dpu_data == return_dict + mock_load_json.return_value = {} + # Data is Cached + assert DeviceDataManager.get_platform_dpus_data() == mock_value["DPUS"] + assert DeviceDataManager.get_dpu_count() == 3 + + @mock.patch('sonic_py_common.device_info.get_path_to_platform_dir', mock.MagicMock(return_value='/tmp')) + @mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_dpus_data') + def test_dpu_interface_data(self, mock_load_json): + mock_value = { + "dpu0": { + "midplane_interface": "dpu0", + "interface": { + "Ethernet224": "Ethernet0" + }, + "rshim_info": "rshim0", + "bus_info": "0000:08:00.0" + }, + "dpu1": { + "midplane_interface": "dpu1", + "interface": { + "Ethernet232": "Ethernet0" + }, + "rshim_info": "rshim1", + "bus_info": "0000:07:00.0" + }, + "dpu2": { + "midplane_interface": "dpu2", + "interface": { + "Ethernet240": "Ethernet0" + }, + "rshim_info": "rshim2", + "bus_info": "0000:01:00.0" + }, + "dpu3": { + "midplane_interface": "dpu3", + "interface": { + "Ethernet248": "Ethernet0" + }, + "rshim_info": "rshim3", + "bus_info": "0000:02:00.0" + } + } + mock_load_json.return_value = mock_value + for dpu_name in mock_value: + for dpu_interface in dpu_interface_values: + assert DeviceDataManager.get_dpu_interface(dpu_name, dpu_interface) == mock_value[dpu_name][dpu_interface] + invalid_dpu_names = ["dpu4", "", "dpu"] + invalid_interface_names = ["midplane", "rshim", "bus"] + for interface_name in invalid_interface_names: + assert not DeviceDataManager.get_dpu_interface("dpu0", interface_name) + for dpu_name in invalid_dpu_names: + assert not DeviceDataManager.get_dpu_interface(dpu_name, DpuInterfaceEnum.MIDPLANE_INT.value) + assert not DeviceDataManager.get_dpu_interface("", "") diff --git a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py index 1d17e7504967..12057396a10b 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py @@ -19,7 +19,8 @@ import os import sys import pytest -from sonic_platform.dpuctlplat import DpuCtlPlat, dpu_map, BootProgEnum +import sonic_platform +from sonic_platform.dpuctlplat import DpuCtlPlat, BootProgEnum, PCI_DEV_BASE from unittest.mock import MagicMock, patch, Mock, call @@ -45,7 +46,8 @@ def create_dpu_list(): obj = create_dpu_list() - +rshim_interface = "rshim@0" +pci_dev_path = os.path.join(PCI_DEV_BASE, "0000:08:00.0", 'remove') class TestDpuClass: """Tests for dpuctl Platform API Wrapper""" @classmethod @@ -53,6 +55,9 @@ def setup_class(cls): """Setup function for all tests for dpuctl implementation""" os.environ["PATH"] += os.pathsep + scripts_path os.environ["MLNX_PLATFORM_API_DPUCTL_UNIT_TESTING"] = "2" + dpuctl_obj = obj["dpuctl_list"][0] + dpuctl_obj.rshim_interface = rshim_interface + dpuctl_obj.pci_dev_path = pci_dev_path @patch('os.path.exists', MagicMock(return_value=True)) @patch('multiprocessing.Process.start', MagicMock(return_value=True)) @@ -74,8 +79,8 @@ def mock_write_file(file_name, content_towrite): with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ patch.object(dpuctl_obj, 'read_boot_prog', MagicMock(return_value=BootProgEnum.OS_RUN.value)): assert dpuctl_obj.dpu_power_off(True) - assert written_data[0]["file"].endswith( - f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + print(f"{written_data}") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert "0" == written_data[1]["data"] assert written_data[1]["file"].endswith( @@ -88,7 +93,7 @@ def mock_write_file(file_name, content_towrite): assert mock_inotify.call_args.args[0].endswith( f"{dpuctl_obj.get_hwmgmt_name()}_shtdn_ready") assert written_data[0]["file"].endswith( - f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -100,7 +105,7 @@ def mock_write_file(file_name, content_towrite): assert mock_inotify.call_args.args[0].endswith( f"{dpuctl_obj.get_hwmgmt_name()}_shtdn_ready") assert written_data[0]["file"].endswith( - f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -196,7 +201,7 @@ def mock_write_file(file_name, content_towrite): assert "1" == written_data[18]["data"] @patch('os.path.exists', MagicMock(return_value=True)) - @patch('multiprocessing.Process.start', MagicMock(return_value=True)) + @patch('multiprocessing.Process.start', MagicMock(return_value=None)) @patch('multiprocessing.Process.is_alive', MagicMock(return_value=False)) @patch('sonic_platform.inotify_helper.InotifyHelper.wait_watch') @patch('sonic_platform.inotify_helper.InotifyHelper.__init__') @@ -217,7 +222,7 @@ def mock_write_file(file_name, content_towrite): dpuctl_obj.write_file = mock_write_file assert dpuctl_obj.dpu_reboot(False) assert len(written_data) == 4 - assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -231,7 +236,7 @@ def mock_write_file(file_name, content_towrite): written_data = [] assert not dpuctl_obj.dpu_reboot() assert len(written_data) == 22 - assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -264,7 +269,7 @@ def mock_write_file(file_name, content_towrite): assert dpuctl_obj.dpu_reboot(True) mock_add_watch.return_value = None assert len(written_data) == 6 - assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -282,7 +287,7 @@ def mock_write_file(file_name, content_towrite): written_data = [] assert not dpuctl_obj.dpu_reboot(True) assert len(written_data) == 18 - assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -310,7 +315,7 @@ def mock_write_file(file_name, content_towrite): patch.object(dpuctl_obj, '_reboot_force') as mock_reset_force, \ patch.object(dpuctl_obj, 'dpu_rshim_service_control', wraps=MagicMock(return_value=None)), \ patch.object(dpuctl_obj, 'log_info') as mock_obj: - mock_boot_prog.return_vale = BootProgEnum.RST.value + mock_boot_prog.return_value = BootProgEnum.RST.value mock_add_watch.return_value = True assert dpuctl_obj.dpu_reboot(False) assert mock_obj.call_args_list[1].args[0] == "Reboot with force = True since OS is not in running state on DPU" @@ -334,7 +339,7 @@ def mock_write_file(file_name, content_towrite): # Rshim service is only stopped and not started mock_rshim.assert_called_once() mock_rshim.call_args.args[0] == "stop" - assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -353,7 +358,7 @@ def mock_write_file(file_name, content_towrite): assert dpuctl_obj.dpu_reboot(forced=False, no_wait=True) mock_rshim.assert_called_once() mock_rshim.call_args.args[0] == "stop" - assert written_data[0]["file"].endswith(f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}/remove") + assert written_data[0]["file"].endswith(f"{pci_dev_path}") assert "1" == written_data[0]["data"] assert written_data[1]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") assert "0" == written_data[1]["data"] @@ -445,7 +450,8 @@ def mock_time_diff(): with patch("time.time", wraps=mock_time_diff): # PCI Device is not recognized assert not dpuctl_obj.wait_for_pci() - assert f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}" in mock_exists.call_args.args[0] + pci_parent_path = os.path.dirname(pci_dev_path) + assert pci_parent_path == mock_exists.call_args.args[0] mock_obj.register.assert_called_once() mock_obj.poll.assert_called_once() # PCI device is recognized immediately @@ -453,7 +459,7 @@ def mock_time_diff(): mock_exists.reset_mock() mock_exists.return_value = True assert dpuctl_obj.wait_for_pci() - assert f"{dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('pci_id')}" in mock_exists.call_args.args[0] + assert pci_parent_path == mock_exists.call_args.args[0] mock_obj.register.assert_not_called() mock_obj.poll.assert_not_called() # PCI device is added later (Detected in Loop) @@ -473,9 +479,23 @@ def mock_time_diff(): assert dpuctl_obj.wait_for_pci() mock_obj.register.assert_called_once() mock_obj.poll.assert_not_called() - with patch.object(dpuctl_obj, '_name', "dpua"), patch.object(dpuctl_obj, 'log_error') as mock_obj: - dpuctl_obj.wait_for_pci() - mock_obj.assert_called_once_with("Unable to wait for PCI device") + with patch.object(dpuctl_obj, 'pci_dev_path', None), \ + patch('sonic_platform.device_data.DeviceDataManager.get_dpu_interface') as mock_int,\ + patch.object(dpuctl_obj, 'log_error') as mock_obj: + mock_int.return_value = None + dpuctl_obj.wait_for_pci() + mock_obj.assert_called_once_with("Unable to wait for PCI device:Unable to obtain pci device id for dpu0 from platform.json") + new_pci_dev_id = "0000:05:00.0" + mock_int.return_value = new_pci_dev_id + dpuctl_obj.wait_for_pci() + assert dpuctl_obj.pci_dev_path.endswith(f"{new_pci_dev_id}/remove") + # pci dev_path is cached + mock_int.reset_mock() + mock_int.return_value = "None" + dpuctl_obj.wait_for_pci() + mock_int.assert_not_called() + assert dpuctl_obj.pci_dev_path.endswith(f"{new_pci_dev_id}/remove") + def test_rshim_service(self): dpuctl_obj = obj["dpuctl_list"][0] @@ -483,7 +503,7 @@ def test_rshim_service(self): dpuctl_obj.dpu_rshim_service_control('start') mock_method.assert_called_once() cmd_string = ' '.join(mock_method.call_args.args[0]) - service_name = dpu_map.get(dpuctl_obj.get_hwmgmt_name()).get('rshim') + service_name = rshim_interface operation = "Start" assert (operation in cmd_string) and (service_name in cmd_string) mock_method.reset_mock() @@ -494,9 +514,21 @@ def test_rshim_service(self): mock_method.assert_called_once() with pytest.raises(TypeError): dpuctl_obj.dpu_rshim_service_control() - with patch.object(dpuctl_obj, 'get_hwmgmt_name', return_value="dpu5"), patch.object(dpuctl_obj, 'log_error') as mock_obj: + with patch.object(dpuctl_obj, 'rshim_interface', None), \ + patch('sonic_platform.device_data.DeviceDataManager.get_dpu_interface') as mock_int,\ + patch.object(dpuctl_obj, 'log_error') as mock_obj: + mock_int.return_value = None + dpuctl_obj.dpu_rshim_service_control('start') + mock_obj.assert_called_once_with("Failed to start rshim!: Unable to Parse rshim information for dpu0 from Platform.json") + mock_int.return_value = "rshim1" + dpuctl_obj.dpu_rshim_service_control('start') + assert dpuctl_obj.rshim_interface == "rshim@1" + mock_int.reset_mock() + mock_int.return_value = "rshim20" dpuctl_obj.dpu_rshim_service_control('start') - mock_obj.assert_called_once_with("Failed to start rshim!: 'dpu5'") + # Rshim name is cached + mock_int.assert_not_called() + assert dpuctl_obj.rshim_interface == "rshim@1" def test_pre_and_post(self): dpuctl_obj = obj["dpuctl_list"][0] From 5d7558c236cef4631935d120631ff2763ff838fa Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Fri, 1 Nov 2024 05:33:32 +0200 Subject: [PATCH 3/9] Addition of skip option for reboot --- .../sonic_platform/dpuctlplat.py | 36 ++++++-- .../tests/test_dpuctlplat.py | 85 +++++++++++++------ 2 files changed, 86 insertions(+), 35 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index 603ac55ddf2a..979e5cdafe38 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -138,14 +138,21 @@ def run_cmd_output(self, cmd, raise_exception=True): def dpu_pre_shutdown(self): """Method to execute shutdown activities for the DPU""" - self.dpu_rshim_service_control("stop") - self.dpu_pci_remove() + rshim_op = self.dpu_rshim_service_control("stop") + pci_rem_op = self.dpu_pci_remove() + if rshim_op and pci_rem_op: + return True + return False def dpu_post_startup(self): """Method to execute all post startup activities for the DPU""" - self.dpu_pci_scan() + pci_scan_op = self.dpu_pci_scan() + rshim_op = None if self.wait_for_pci(): - self.dpu_rshim_service_control("start") + rshim_op = self.dpu_rshim_service_control("start") + if rshim_op and pci_scan_op: + return True + return False def dpu_rshim_service_control(self, op): """Start/Stop the RSHIM service for the current DPU""" @@ -163,8 +170,11 @@ def dpu_rshim_service_control(self, op): f"string:{self.rshim_interface}.service", "string:replace"] self.run_cmd_output(rshim_cmd) + # If command fails execution exception is raised , return true if control is still in try block + return True except Exception as e: self.log_error(f"Failed to {op} rshim!: {e}") + return False @contextmanager def get_open_fd(self, path, flag): @@ -283,13 +293,20 @@ def dpu_pci_remove(self): """Per DPU PCI remove API""" try: self.write_file(self.get_pci_dev_path(), OperationType.SET.value) + return True except Exception: self.log_info(f"Failed PCI Removal!") + return False def dpu_pci_scan(self): """PCI Scan API""" - pci_scan_path = "/sys/bus/pci/rescan" - self.write_file(pci_scan_path, OperationType.SET.value) + try: + pci_scan_path = "/sys/bus/pci/rescan" + self.write_file(pci_scan_path, OperationType.SET.value) + return True + except Exception: + self.log_info(f"Failed to rescan") + return False def dpu_power_on(self, forced=False): """Per DPU Power on API""" @@ -343,10 +360,11 @@ def _reboot_force(self, no_wait): return_value = self._power_on_force(no_wait=no_wait) return return_value - def dpu_reboot(self, forced=False, no_wait=False): + def dpu_reboot(self, forced=False, no_wait=False, skip_pre_post=False): """Per DPU Power on API""" with self.boot_prog_context(): - self.dpu_pre_shutdown() + if not skip_pre_post: + self.dpu_pre_shutdown() self.log_info(f"Reboot with force = {forced}") if forced: return_value = self._reboot_force(no_wait) @@ -356,7 +374,7 @@ def dpu_reboot(self, forced=False, no_wait=False): else: return_value = self._reboot(no_wait) # No Post startup as well for no_wait call - if not no_wait: + if (not no_wait) and (not skip_pre_post): self.dpu_post_startup() if return_value: self.log_info("Reboot Complete") diff --git a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py index 12057396a10b..35a74f0d6862 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py @@ -48,6 +48,8 @@ def create_dpu_list(): rshim_interface = "rshim@0" pci_dev_path = os.path.join(PCI_DEV_BASE, "0000:08:00.0", 'remove') + + class TestDpuClass: """Tests for dpuctl Platform API Wrapper""" @classmethod @@ -113,7 +115,7 @@ def mock_write_file(file_name, content_towrite): assert "0" == written_data[2]["data"] assert written_data[3]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_pwr_force") assert "0" == written_data[3]["data"] - # Test whether value of boot_progress skips power off + # Test whether value of boot_progress skips power off with patch.object(dpuctl_obj, 'read_boot_prog') as mock_boot_prog, \ patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ patch.object(dpuctl_obj, '_power_off_force') as mock_power_off_force, \ @@ -141,7 +143,7 @@ def mock_write_file(file_name, content_towrite): assert dpuctl_obj.dpu_power_off(False) mock_power_off_force.assert_not_called() mock_power_off.assert_called_once() - + @patch('os.path.exists', MagicMock(return_value=True)) @patch('multiprocessing.Process.start', MagicMock(return_value=True)) @patch('multiprocessing.Process.is_alive', MagicMock(return_value=False)) @@ -162,7 +164,7 @@ def mock_write_file(file_name, content_towrite): with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ patch.object(dpuctl_obj, 'wait_for_pci', wraps=MagicMock(return_value=None)), \ patch.object(dpuctl_obj, 'dpu_rshim_service_control', wraps=MagicMock(return_value=None)), \ - patch.object(dpuctl_obj, 'read_boot_prog', wraps=MagicMock(return_value=BootProgEnum.RST.value)): + patch.object(dpuctl_obj, 'read_boot_prog', wraps=MagicMock(return_value=BootProgEnum.RST.value)): assert dpuctl_obj.dpu_power_on(True) assert mock_inotify.call_args.args[0].endswith( f"{dpuctl_obj.get_hwmgmt_name()}_ready") @@ -261,7 +263,7 @@ def mock_write_file(file_name, content_towrite): mock_inotify.reset_mock() mock_add_watch.return_value = True mock_inotify.return_value = None - written_data=[] + written_data = [] with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ patch.object(dpuctl_obj, 'read_boot_prog', MagicMock(return_value=BootProgEnum.OS_RUN.value)), \ patch.object(dpuctl_obj, 'dpu_rshim_service_control', wraps=MagicMock(return_value=None)): @@ -370,6 +372,17 @@ def mock_write_file(file_name, content_towrite): assert "1" == written_data[4]["data"] mock_inotify.called_once() mock_add_watch.called_once() + # Skip pre startup and post shutdown + written_data = [] + with patch.object(dpuctl_obj, 'write_file', wraps=mock_write_file), \ + patch.object(dpuctl_obj, 'read_boot_prog', MagicMock(return_value=BootProgEnum.OS_START.value)), \ + patch.object(dpuctl_obj, 'dpu_rshim_service_control') as mock_rshim: + assert dpuctl_obj.dpu_reboot(skip_pre_post=True) + mock_rshim.assert_not_called() + # We skip writing PCI data + assert written_data[0]["file"].endswith(f"{dpuctl_obj.get_hwmgmt_name()}_rst") + assert "0" == written_data[0]["data"] + assert not written_data[-1]["file"].endswith("rescan") def test_prog_update(self): dpuctl_obj = obj["dpuctl_list"][0] @@ -480,22 +493,21 @@ def mock_time_diff(): mock_obj.register.assert_called_once() mock_obj.poll.assert_not_called() with patch.object(dpuctl_obj, 'pci_dev_path', None), \ - patch('sonic_platform.device_data.DeviceDataManager.get_dpu_interface') as mock_int,\ - patch.object(dpuctl_obj, 'log_error') as mock_obj: - mock_int.return_value = None - dpuctl_obj.wait_for_pci() - mock_obj.assert_called_once_with("Unable to wait for PCI device:Unable to obtain pci device id for dpu0 from platform.json") - new_pci_dev_id = "0000:05:00.0" - mock_int.return_value = new_pci_dev_id - dpuctl_obj.wait_for_pci() - assert dpuctl_obj.pci_dev_path.endswith(f"{new_pci_dev_id}/remove") - # pci dev_path is cached - mock_int.reset_mock() - mock_int.return_value = "None" - dpuctl_obj.wait_for_pci() - mock_int.assert_not_called() - assert dpuctl_obj.pci_dev_path.endswith(f"{new_pci_dev_id}/remove") - + patch('sonic_platform.device_data.DeviceDataManager.get_dpu_interface') as mock_int,\ + patch.object(dpuctl_obj, 'log_error') as mock_obj: + mock_int.return_value = None + dpuctl_obj.wait_for_pci() + mock_obj.assert_called_once_with("Unable to wait for PCI device:Unable to obtain pci device id for dpu0 from platform.json") + new_pci_dev_id = "0000:05:00.0" + mock_int.return_value = new_pci_dev_id + dpuctl_obj.wait_for_pci() + assert dpuctl_obj.pci_dev_path.endswith(f"{new_pci_dev_id}/remove") + # pci dev_path is cached + mock_int.reset_mock() + mock_int.return_value = "None" + dpuctl_obj.wait_for_pci() + mock_int.assert_not_called() + assert dpuctl_obj.pci_dev_path.endswith(f"{new_pci_dev_id}/remove") def test_rshim_service(self): dpuctl_obj = obj["dpuctl_list"][0] @@ -505,18 +517,18 @@ def test_rshim_service(self): cmd_string = ' '.join(mock_method.call_args.args[0]) service_name = rshim_interface operation = "Start" - assert (operation in cmd_string) and (service_name in cmd_string) + assert (operation in cmd_string) and (service_name in cmd_string) mock_method.reset_mock() operation = "Stop" dpuctl_obj.dpu_rshim_service_control('stop') cmd_string = ' '.join(mock_method.call_args.args[0]) - assert (operation in cmd_string) and (service_name in cmd_string) + assert (operation in cmd_string) and (service_name in cmd_string) mock_method.assert_called_once() with pytest.raises(TypeError): dpuctl_obj.dpu_rshim_service_control() with patch.object(dpuctl_obj, 'rshim_interface', None), \ - patch('sonic_platform.device_data.DeviceDataManager.get_dpu_interface') as mock_int,\ - patch.object(dpuctl_obj, 'log_error') as mock_obj: + patch('sonic_platform.device_data.DeviceDataManager.get_dpu_interface') as mock_int,\ + patch.object(dpuctl_obj, 'log_error') as mock_obj: mock_int.return_value = None dpuctl_obj.dpu_rshim_service_control('start') mock_obj.assert_called_once_with("Failed to start rshim!: Unable to Parse rshim information for dpu0 from Platform.json") @@ -526,7 +538,7 @@ def test_rshim_service(self): mock_int.reset_mock() mock_int.return_value = "rshim20" dpuctl_obj.dpu_rshim_service_control('start') - # Rshim name is cached + # Rshim name is cached mock_int.assert_not_called() assert dpuctl_obj.rshim_interface == "rshim@1" @@ -536,12 +548,17 @@ def test_pre_and_post(self): manager_mock = Mock() manager_mock.attach_mock(mock_rshim, 'rshim') manager_mock.attach_mock(mock_write, 'write') - dpuctl_obj.dpu_pre_shutdown() + mock_rshim.return_value = True + mock_write.return_value = True + assert dpuctl_obj.dpu_pre_shutdown() mock_rshim.assert_called_once() mock_write.assert_called_once() # Confirm the order of calls and the parameters manager_mock.mock_calls[0] == call.rshim('stop') manager_mock.mock_calls[1] == call.rshim(dpuctl_obj.pci_dev_path, '1') + mock_rshim.return_value = False + assert not dpuctl_obj.dpu_pre_shutdown() + mock_rshim.return_value = True # Test post startup mock_rshim.reset_mock() mock_write.reset_mock() @@ -558,6 +575,22 @@ def test_pre_and_post(self): manager_mock.mock_calls[0] == call.rshim('/sys/bus/pci/rescan', '1') manager_mock.mock_calls[1] == call.pci() manager_mock.mock_calls[2] == call.rshim('start') + mock_rshim.return_value = False + assert not dpuctl_obj.dpu_post_startup() + with patch.object(dpuctl_obj, 'write_file', side_effect=Exception("Mock")), \ + patch.object(dpuctl_obj, 'run_cmd_output', MagicMock(return_value=True)): + assert not dpuctl_obj.dpu_pre_shutdown() + with patch.object(dpuctl_obj, 'run_cmd_output', side_effect=Exception("Mock")), \ + patch.object(dpuctl_obj, 'dpu_pci_remove', MagicMock(return_value=True)): + assert not dpuctl_obj.dpu_pre_shutdown() + with patch.object(dpuctl_obj, 'write_file', side_effect=Exception("Mock")), \ + patch.object(dpuctl_obj, 'wait_for_pci', MagicMock(return_value=True)), \ + patch.object(dpuctl_obj, 'run_cmd_output', MagicMock(return_value=True)): + assert not dpuctl_obj.dpu_post_startup() + with patch.object(dpuctl_obj, 'run_cmd_output', side_effect=Exception("Mock")), \ + patch.object(dpuctl_obj, 'wait_for_pci', MagicMock(return_value=True)), \ + patch.object(dpuctl_obj, 'dpu_pci_scan', MagicMock(return_value=True)): + assert not dpuctl_obj.dpu_post_startup() @classmethod def teardown_class(cls): From 6718aee414d71ec818984c70b01dacc1dfe98c84 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Wed, 13 Nov 2024 14:28:38 +0000 Subject: [PATCH 4/9] Updating headers --- .../mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py | 3 ++- platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index 979e5cdafe38..4d43daf61204 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -1,5 +1,6 @@ # -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py index 35a74f0d6862..34c7870cfd85 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_dpuctlplat.py @@ -1,5 +1,6 @@ # -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 24e749e10d20908b11c5a14c11a92ae3cd1135fb Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Wed, 13 Nov 2024 14:31:45 +0000 Subject: [PATCH 5/9] Fixing test issues --- .../mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index 4d43daf61204..0b2cff037c45 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -104,6 +104,8 @@ def __init__(self, dpu_name): self.shtdn_state = None self.dpu_ready_state = None self.setup_logger() + self.pci_dev_path = None + self.rshim_interface = None # Use systemd dbus to execute start and stop rshim service os.environ['DBUS_SESSION_BUS_ADDRESS'] = 'unix:path=/run/dbus/system_bus_socket' self.verbosity = False From 158e8661d2f0bf2cde47c6118fd280f85e4007e4 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Wed, 13 Nov 2024 15:32:53 +0000 Subject: [PATCH 6/9] added parser --- .../sonic_platform/dpu_vpd_parser.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/dpu_vpd_parser.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpu_vpd_parser.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpu_vpd_parser.py new file mode 100644 index 000000000000..3347a55aa560 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpu_vpd_parser.py @@ -0,0 +1,46 @@ +# +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .vpd_parser import VpdParser + + +class DpuVpdParser(VpdParser): + """DPU Specific VPD parser""" + def __init__(self, file_path, dpu_name): + super(DpuVpdParser, self).__init__(file_path=file_path) + self.dpu_name = dpu_name + + def get_dpu_data(self, key=None): + """Retrieves VPD Entry for DPU Specific Key""" + return self.get_entry_value(f"{self.dpu_name}_{key}") + + def get_dpu_base_mac(self): + """Retrieves VPD Entry for DPU Specific Mac Address""" + return self.get_dpu_data("BASE_MAC") + + def get_dpu_serial(self): + """Retrieves VPD Entry for DPU Specific Serial Number""" + return self.get_dpu_data("SN") + + def get_dpu_revision(self): + """Retrieves VPD Entry for DPU Specific Revision""" + return self.get_dpu_data("REV") + + def get_dpu_model(self): + """Retrieves VPD Entry for DPU Specific Model Number""" + return self.get_dpu_data("PN") From 13b9f79d6bdcb8c0cd2731c25396878ef8b68263 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Sat, 23 Nov 2024 01:01:18 +0000 Subject: [PATCH 7/9] Updated rshim get method --- .../mlnx-platform-api/sonic_platform/dpuctlplat.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index 0b2cff037c45..c73320337d4f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -157,20 +157,24 @@ def dpu_post_startup(self): return True return False - def dpu_rshim_service_control(self, op): - """Start/Stop the RSHIM service for the current DPU""" - try: - if not self.rshim_interface: + def get_rshim_interface(self): + """Parse the rshim interface from platform.json, raise Runtime error if the device id is not available""" + if not self.rshim_interface: interface_name = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.RSHIM_INT.value) if not interface_name: raise RuntimeError(f"Unable to Parse rshim information for {self.dpu_name} from Platform.json") # rshim1 -> rshim@1 self.rshim_interface = interface_name[:5] + "@" + interface_name[5:] + return self.rshim_interface + + def dpu_rshim_service_control(self, op): + """Start/Stop the RSHIM service for the current DPU""" + try: rshim_cmd = ["dbus-send", "--dest=org.freedesktop.systemd1", "--type=method_call", "--print-reply", "--reply-timeout=2000", "/org/freedesktop/systemd1", f"org.freedesktop.systemd1.Manager.{op.capitalize()}Unit", - f"string:{self.rshim_interface}.service", + f"string:{self.get_rshim_interface()}.service", "string:replace"] self.run_cmd_output(rshim_cmd) # If command fails execution exception is raised , return true if control is still in try block From 2bd00a9fa0790c8f58c18ec6418b389690460ed0 Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Tue, 26 Nov 2024 01:49:08 +0000 Subject: [PATCH 8/9] Updated return value --- .../mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py index c73320337d4f..b66f1be2daad 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py @@ -143,9 +143,7 @@ def dpu_pre_shutdown(self): """Method to execute shutdown activities for the DPU""" rshim_op = self.dpu_rshim_service_control("stop") pci_rem_op = self.dpu_pci_remove() - if rshim_op and pci_rem_op: - return True - return False + return rshim_op and pci_rem_op def dpu_post_startup(self): """Method to execute all post startup activities for the DPU""" From e1cf674b544cebb126880d5e1ed775d2cd28771e Mon Sep 17 00:00:00 2001 From: gpunathilell Date: Sun, 1 Dec 2024 03:07:19 +0000 Subject: [PATCH 9/9] Fix headers --- .../mellanox/mlnx-platform-api/sonic_platform/device_data.py | 5 +++-- .../mellanox/mlnx-platform-api/tests/test_device_data.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index d443495880a8..2f2eee7b871c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -1,6 +1,7 @@ # -# Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. -# Apache-2.0 +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py index b6267d8fdc87..f34cb4e471e7 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py @@ -1,6 +1,7 @@ # -# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. -# Apache-2.0 +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.