From 94441cbe0488ad410bed24bd8fbf78ca4d599231 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 21 Nov 2024 23:28:03 -0600 Subject: [PATCH] [SWDEV-499029] Fix inability to change memory partition modes Changes: * [API] Removed the board name check, which fixes this for other MI ASICs * [CLI] Increased the progress bar duration for changing memory partition modes to 140 seconds, since driver reload time varies per system Change-Id: Ifcaf40d28b4adf5eaa800c9e3748d33749dc414a Signed-off-by: Charis Poag (cherry picked from commit d04cec7f1ddee741cdb5be8e9893bac89f5bbad1) --- python_smi_tools/rocm_smi.py | 4 +-- src/rocm_smi.cc | 33 ++++----------------- src/rocm_smi_device.cc | 56 +++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 47 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index bb294f8..58a93d6 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1863,7 +1863,7 @@ def setMemoryPartition(deviceList, memoryPartition, autoRespond): (', '.join(map(str, memory_partition_type_l))) )) return (None, None) - kTimeWait = 40 + kTimeWait = 140 t1 = multiprocessing.Process(target=showProgressbar, args=("Updating memory partition",kTimeWait,)) t1.start() @@ -3885,7 +3885,7 @@ def confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond): ******WARNING******\n Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads. ROCm SMI will then attempt to change memory (NPS) partition mode. - Upon a successful set, ROCm SMI will then initiate an action to restart amdgpu driver. + Upon a successful set, ROCm SMI will then initiate an action to restart AMD GPU driver. This action will change all GPU's in the hive to the requested memory (NPS) partition mode. Please use this utility with caution. 
diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 68fe5ed..50e2f31 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -4901,6 +4901,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX + const int k1000_MS_WAIT = 1000; const uint32_t kMaxBoardLength = 128; bool isCorrectDevice = false; char boardName[kMaxBoardLength]; @@ -4914,32 +4915,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, char current_memory_mode[kMaxCurrentMemoryMode]; current_memory_mode[0] = '\0'; - // rsmi_dev_memory_partition_set is only available for for discrete variant, - // others are required to update through bios settings - rsmi_dev_name_get(dv_ind, boardName, static_cast(kMaxBoardLength)); - std::string myBoardName = boardName; - if (!myBoardName.empty()) { - std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(), - ::tolower); - if (myBoardName.find("mi") != std::string::npos && - myBoardName.find("00x") != std::string::npos) { - isCorrectDevice = true; - } - } - - if (!isCorrectDevice) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " - << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) - << " | Cause: device board name does not support this action" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false); - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } // Is the current mode already what user requested? 
switch (memory_partition) { @@ -5075,6 +5050,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Returning = " << getRSMIStatusString(restartRet, false); LOG_TRACE(ss); + if (restartRet != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -5092,10 +5068,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, std::string current_memory_mode_str = "unknown"; rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR; int maxWaitSeconds = 10; - const int k1000_MS_WAIT = 1000; // wait until we can read SYSFS again if (restartRet == RSMI_STATUS_SUCCESS) { - while (current_memory_mode_str != user_requested_memory_partition) { + while ((current_memory_mode_str != user_requested_memory_partition) + && maxWaitSeconds > 0) { maxWaitSeconds -= 1; can_read_sysfs_again = rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode); @@ -5111,6 +5087,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Data (user requested mode): " << user_requested_memory_partition << " | Current Memory Partition Mode: " << current_memory_mode_str << " | Available Memory Partition Modes: " << memory_capabilities_str + << " | maxWaitSeconds: " << maxWaitSeconds << " | total wait time (sec): " << (10 - maxWaitSeconds) << " | Returning = " << getRSMIStatusString(can_read_sysfs_again, false); diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 4ef1a97..578bb8a 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -1391,38 +1391,56 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { bool restartInProgress = true; bool isRestartInProgress = true; bool isAMDGPUModuleLive = false; + bool restartGDM = false; std::string captureRestartErr; + const int kTimeToWaitForDriverMSec = 1000; // sudo systemctl is-active gdm // we do not care about the success of checking if gdm is active - std::tie(success, out) = executeCommand("systemctl is-active gdm"); - (out == "active") ? 
(restartSuccessful &= success) : - (restartSuccessful = true); + std::tie(success, out) = executeCommand("systemctl is-active gdm", true); + (out == "active") ? (restartGDM = true) : (restartGDM = false); + ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = " + << out << "; success = " << (success ? "True" : "False"); + LOG_INFO(ss); // if gdm is active -> sudo systemctl stop gdm // TODO(AMD_SMI_team): are are there other display manager's we need to take into account? // see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB - if (success && (out == "active")) { + if (success && (out == "active") && (restartGDM)) { wasGdmServiceActive = true; - std::tie(success, out) = executeCommand("systemctl stop gdm&", false); - restartSuccessful &= success; + std::tie(success, out) = executeCommand("systemctl stop gdm&", true); + ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = " + << out << "; success = " << (success ? "True" : "False"); + LOG_INFO(ss); + } else { + success = true; // ignore failures to restart gdm } + ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = " + << out << "; success = " << (success ? "True" : "False") + << "; restartSuccessful = " << (restartSuccessful ? "True" : "False") + << "; captureRestartErr = " << captureRestartErr; + LOG_INFO(ss); + // sudo modprobe -r amdgpu // sudo modprobe amdgpu - std::tie(success, out) = - executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true); + std::tie(success, out) = executeCommand( + "modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true); restartSuccessful &= success; captureRestartErr = out; - - if (success) { - restartSuccessful = false; - } + ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = " + << out << "; success = " << (success ? "True" : "False") + << "; restartSuccessful = " << (restartSuccessful ? 
"True" : "False") + << "; captureRestartErr = " << captureRestartErr; + LOG_INFO(ss); // if gdm was active -> sudo systemctl start gdm - if (wasGdmServiceActive) { - std::tie(success, out) = executeCommand("systemctl start gdm&", false); - restartSuccessful &= success; + // We don't care if successful or not, just try to restart as a courtesy + if (wasGdmServiceActive && restartGDM) { + std::tie(success, out) = executeCommand("systemctl start gdm&", true); + ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = " + << out << "; success = " << (success ? "True" : "False"); + LOG_INFO(ss); } // Return early if there was an issue restarting amdgpu @@ -1436,7 +1454,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { // wait for amdgpu module to come back up rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress, &isAMDGPUModuleLive); - const int kTimeToWaitForDriverMSec = 1000; int maxLoops = 10; // wait a max of 10 sec while (status != RSMI_STATUS_SUCCESS) { maxLoops -= 1; @@ -1467,7 +1484,7 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress, // wait for amdgpu module to come back up std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true); ss << __PRETTY_FUNCTION__ - << " | success = " << success + << " | success = " << (success ? "True" : "False") << " | out = " << out; LOG_DEBUG(ss); if ((success == true) && (!out.empty())) { @@ -1478,6 +1495,11 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress, } *isRestartInProgress = deviceRestartInProgress; *isAMDGPUModuleLive = isSystemAMDGPUModuleLive; + ss << __PRETTY_FUNCTION__ + << " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False") + << " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False") + << " | out = " << out; + LOG_DEBUG(ss); return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS : RSMI_STATUS_AMDGPU_RESTART_ERR);