Skip to content

Commit

Permalink
[SWDEV-499029] Fix inability to change memory partition modes
Browse files Browse the repository at this point in the history
Changes:
  * [API] Removed the board name check, which fixes other MI ASICs
  * [CLI] Increased the progress bar duration for changing memory partition
    modes to 140 seconds, since driver reload time varies per system

Change-Id: Ifcaf40d28b4adf5eaa800c9e3748d33749dc414a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
(cherry picked from commit d04cec7)
  • Loading branch information
charis-poag-amd committed Nov 25, 2024
1 parent 55c0f58 commit 94441cb
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 47 deletions.
4 changes: 2 additions & 2 deletions python_smi_tools/rocm_smi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,7 +1863,7 @@ def setMemoryPartition(deviceList, memoryPartition, autoRespond):
(', '.join(map(str, memory_partition_type_l))) ))
return (None, None)

kTimeWait = 40
kTimeWait = 140
t1 = multiprocessing.Process(target=showProgressbar,
args=("Updating memory partition",kTimeWait,))
t1.start()
Expand Down Expand Up @@ -3885,7 +3885,7 @@ def confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond):
******WARNING******\n
Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads.
ROCm SMI will then attempt to change memory (NPS) partition mode.
Upon a successful set, ROCm SMI will then initiate an action to restart amdgpu driver.
Upon a successful set, ROCm SMI will then initiate an action to restart AMD GPU driver.
This action will change all GPU's in the hive to the requested memory (NPS) partition mode.
Please use this utility with caution.
Expand Down
33 changes: 5 additions & 28 deletions src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4901,6 +4901,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
const int k1000_MS_WAIT = 1000;
const uint32_t kMaxBoardLength = 128;
bool isCorrectDevice = false;
char boardName[kMaxBoardLength];
Expand All @@ -4914,32 +4915,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
char current_memory_mode[kMaxCurrentMemoryMode];
current_memory_mode[0] = '\0';

// rsmi_dev_memory_partition_set is only available for the discrete variant,
// others are required to update through bios settings
rsmi_dev_name_get(dv_ind, boardName, static_cast<size_t>(kMaxBoardLength));
std::string myBoardName = boardName;
if (!myBoardName.empty()) {
std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(),
::tolower);
if (myBoardName.find("mi") != std::string::npos &&
myBoardName.find("00x") != std::string::npos) {
isCorrectDevice = true;
}
}

if (!isCorrectDevice) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
<< amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: device board name does not support this action"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false);
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}

// Is the current mode already what user requested?
switch (memory_partition) {
Expand Down Expand Up @@ -5075,6 +5050,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Returning = "
<< getRSMIStatusString(restartRet, false);
LOG_TRACE(ss);

if (restartRet != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
Expand All @@ -5092,10 +5068,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
std::string current_memory_mode_str = "unknown";
rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR;
int maxWaitSeconds = 10;
const int k1000_MS_WAIT = 1000;
// wait until we can read SYSFS again
if (restartRet == RSMI_STATUS_SUCCESS) {
while (current_memory_mode_str != user_requested_memory_partition) {
while ((current_memory_mode_str != user_requested_memory_partition)
&& maxWaitSeconds > 0) {
maxWaitSeconds -= 1;
can_read_sysfs_again =
rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode);
Expand All @@ -5111,6 +5087,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Data (user requested mode): " << user_requested_memory_partition
<< " | Current Memory Partition Mode: " << current_memory_mode_str
<< " | Available Memory Partition Modes: " << memory_capabilities_str
<< " | maxWaitSeconds: " << maxWaitSeconds
<< " | total wait time (sec): " << (10 - maxWaitSeconds)
<< " | Returning = "
<< getRSMIStatusString(can_read_sysfs_again, false);
Expand Down
56 changes: 39 additions & 17 deletions src/rocm_smi_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1391,38 +1391,56 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
bool restartInProgress = true;
bool isRestartInProgress = true;
bool isAMDGPUModuleLive = false;
bool restartGDM = false;
std::string captureRestartErr;
const int kTimeToWaitForDriverMSec = 1000;

// sudo systemctl is-active gdm
// we do not care about the success of checking if gdm is active
std::tie(success, out) = executeCommand("systemctl is-active gdm");
(out == "active") ? (restartSuccessful &= success) :
(restartSuccessful = true);
std::tie(success, out) = executeCommand("systemctl is-active gdm", true);
(out == "active") ? (restartGDM = true) : (restartGDM = false);
ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);

// if gdm is active -> sudo systemctl stop gdm
// TODO(AMD_SMI_team): are there other display managers we need to take into account?
// see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB
if (success && (out == "active")) {
if (success && (out == "active") && (restartGDM)) {
wasGdmServiceActive = true;
std::tie(success, out) = executeCommand("systemctl stop gdm&", false);
restartSuccessful &= success;
std::tie(success, out) = executeCommand("systemctl stop gdm&", true);
ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
} else {
success = true; // ignore failures to restart gdm
}

ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);

// sudo modprobe -r amdgpu
// sudo modprobe amdgpu
std::tie(success, out) =
executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true);
std::tie(success, out) = executeCommand(
"modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true);
restartSuccessful &= success;
captureRestartErr = out;

if (success) {
restartSuccessful = false;
}
ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);

// if gdm was active -> sudo systemctl start gdm
if (wasGdmServiceActive) {
std::tie(success, out) = executeCommand("systemctl start gdm&", false);
restartSuccessful &= success;
// We don't care if successful or not, just try to restart as a courtesy
if (wasGdmServiceActive && restartGDM) {
std::tie(success, out) = executeCommand("systemctl start gdm&", true);
ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
}

// Return early if there was an issue restarting amdgpu
Expand All @@ -1436,7 +1454,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
// wait for amdgpu module to come back up
rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress,
&isAMDGPUModuleLive);
const int kTimeToWaitForDriverMSec = 1000;
int maxLoops = 10; // wait a max of 10 sec
while (status != RSMI_STATUS_SUCCESS) {
maxLoops -= 1;
Expand Down Expand Up @@ -1467,7 +1484,7 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
// wait for amdgpu module to come back up
std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true);
ss << __PRETTY_FUNCTION__
<< " | success = " << success
<< " | success = " << (success ? "True" : "False")
<< " | out = " << out;
LOG_DEBUG(ss);
if ((success == true) && (!out.empty())) {
Expand All @@ -1478,6 +1495,11 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
}
*isRestartInProgress = deviceRestartInProgress;
*isAMDGPUModuleLive = isSystemAMDGPUModuleLive;
ss << __PRETTY_FUNCTION__
<< " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False")
<< " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
<< " | out = " << out;
LOG_DEBUG(ss);

return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS :
RSMI_STATUS_AMDGPU_RESTART_ERR);
Expand Down

0 comments on commit 94441cb

Please sign in to comment.