From 55c0f585889890e368e1518f39c41c4708447cc2 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Wed, 6 Nov 2024 15:13:32 -0600 Subject: [PATCH] [SWDEV-488276/SWDEV-497613] Update memory partition set functionality Changes: - Added warning screen to ROCm SMI users setting memory partition - Added new API (rsmi_dev_memory_partition_capabilities_get) to retrieve memory partition capabilities (What users can set memory partition modes to) - Increased time-bar for CLI sets display to 40 seconds - API now waits until the driver reloads with SYSFS files active - [SWDEV-475712] [CLI/API] Fixed target_graphics_version field not properly displaying for MI2x or Navi 3x ASICs. - Updated tests Change-Id: Iaf89d1b7ad9ceb449b289bc82ea198fe3b23992e Signed-off-by: Charis Poag (cherry picked from commit 46902274b6c3ee76ec6d7a2d3ec927e3dde01267) --- CHANGELOG.md | 24 +- include/rocm_smi/rocm_smi.h | 33 +++ include/rocm_smi/rocm_smi_device.h | 6 +- include/rocm_smi/rocm_smi_utils.h | 5 +- python_smi_tools/rocm_smi.py | 72 ++++- src/rocm_smi.cc | 272 +++++++++++++++--- src/rocm_smi_device.cc | 125 ++++++-- src/rocm_smi_main.cc | 4 +- src/rocm_smi_utils.cc | 50 +++- .../functional/memorypartition_read_write.cc | 127 +++++--- 10 files changed, 595 insertions(+), 123 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index faf4ff0..e9e5968 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] ### Added +- **Added `rsmi_dev_memory_partition_capabilities_get` which returns driver memory partition capabilities.** +Driver now has the ability to report what the user can set memory partition modes to. User can now see available +memory partition modes upon an invalid argument return from memory partition mode set (`rsmi_dev_memory_partition_set`). 
+ + - **Added support for GPU metrics 1.6 to `rsmi_dev_gpu_metrics_info_get()`** Updated `rsmi_dev_gpu_metrics_info_get()` and structure `rsmi_gpu_metrics_t` to include new fields for PVIOL / TVIOL, XCP (Graphics Compute Partitions) stats, and pcie_lc_perf_other_end_recovery: - `uint64_t accumulation_counter` - used for all throttled calculations @@ -27,17 +32,26 @@ Updated `rsmi_dev_gpu_metrics_info_get()` and structure `rsmi_gpu_metrics_t` to - **Added ability to view raw GPU metrics`rocm-smi --showmetrics`** Users can now view GPU metrics from our new `rocm-smi --showmetrics`. Unlike AMD SMI (or other ROCM-SMI interfaces), these values are ***not*** converted into applicable units as users may see in `amd-smi metric`. Units listed display as indicated by the driver, they are not converted (eg. in other AMD SMI/ROCm SMI interfaces which use the data provided). It is important to note, that fields displaying `N/A` data mean this ASIC does not support or backward compatibility was not provided in a newer ASIC's GPU metric structure. +### Changed + +- **Added back in C++ tests for `memorypartition_read_write`**. +The driver now provides all features needed for memory partition writes, so we have re-enabled `memorypartition_read_write`. + +- **Updated `rsmi_dev_memory_partition_set` to not return until a successful restart of AMD GPU Driver.** +This change keeps checking for up to ~40 seconds for a successful restart of the AMD GPU driver. Additionally, the API call continues to check if memory partition (NPS) SYSFS files are successfully updated to reflect the user's requested memory partition (NPS) mode change. Otherwise, it reports an error back to the user. Due to these changes, we have updated ROCm SMI's CLI to reflect the maximum wait of 40 seconds, while memory partition change is in progress. + +- **All APIs now have the ability to catch driver reporting invalid arguments.** +Now ROCm SMI APIs can show RSMI_STATUS_INVALID_ARGS when driver returns EINVAL. 
+ ### Removed - **Removed `--resetcomputepartition`, and `--resetmemorypartition` options and associated APIs**. - This change is part of the partition feature redesign. - The related APIs `rsmi_dev_compute_partition_reset()` and `rsmi_dev_memory_partition_reset()`. -- **Temporary Disabled C++ tests for `memorypartition_read_write`**. - - This change is part of the partition feature redesign. - - SMI's workflow needs to be adjusted in order to accomidate incoming driver changes to enable - Dynamic memory partition feature. We plan on re-enabling testing for this feature during ROCm - 6.4. +### Resolved issues + +- **Fixed `rsmi_dev_target_graphics_version_get`, `rocm-smi --showhw`, and `rocm-smi --showprod` not displaying properly for MI2x or Navi 3x ASICs.** ### Upcoming changes diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 54c9624..fe6a373 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -4181,6 +4181,39 @@ rsmi_status_t rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, uint32_t len); +/** + * @brief Retrieves the available memory partition capabilities + * for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p memory_partition_caps , + * and uint32 @p len , this function will attempt to obtain the device's + * available memory partition capabilities string. Upon successful + * retrieval, the obtained device's available memory partition capabilities + * string shall be stored in the passed @p memory_partition_caps + * char string variable. + * + * @param[in] dv_ind a device index + * + * @param[inout] memory_partition_caps a pointer to a char string variable, + * which the device's available memory partition capabilities will be written to. + * + * @param[in] len the length of the caller provided buffer @p memory_partition_caps , + * suggested length is 30 or greater. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire memory partition value. In this case, + * only @p len bytes will be written. + * + */ +rsmi_status_t rsmi_dev_memory_partition_capabilities_get( + uint32_t dv_ind, char *memory_partition_caps, uint32_t len); + /** * @brief Modifies a selected device's current memory partition setting. * diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 0101a2c..ea2abb1 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -172,7 +172,8 @@ enum DevInfoTypes { kDevGpuReset, kDevAvailableComputePartition, kDevComputePartition, - kDevMemoryPartition + kDevMemoryPartition, + kDevAvailableMemoryPartition, }; typedef struct { @@ -227,6 +228,8 @@ class Device { bool DeviceAPISupported(std::string name, uint64_t variant, uint64_t sub_variant); rsmi_status_t restartAMDGpuDriver(void); + rsmi_status_t isRestartInProgress(bool *isRestartInProgress, + bool *isAMDGPUModuleLive); rsmi_status_t storeDevicePartitions(uint32_t dv_ind); template std::string readBootPartitionState(uint32_t dv_ind); rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); @@ -244,6 +247,7 @@ class Device { static const std::map devInfoTypesStrings; void set_smi_device_id(uint32_t i) { m_device_id = i; } void set_smi_partition_id(uint32_t i) { m_partition_id = i; } + static const char* get_type_string(DevInfoTypes type); private: std::shared_ptr monitor_; diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index abf5e98..0cb75a0 100755 
--- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -90,7 +90,8 @@ std::pair executeCommand(std::string command, rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, std::string stateName, std::string storageData); std::vector getListOfAppTmpFiles(); -bool containsString(std::string originalString, std::string substring); +bool containsString(std::string originalString, std::string substring, + bool displayComparisons = false); std::tuple readTmpFile( uint32_t dv_ind, std::string stateName, @@ -138,6 +139,8 @@ std::string removeNewLines(const std::string &s); std::string removeString(const std::string origStr, const std::string &removeMe); +void system_wait(int milli_seconds); +int countDigit(uint64_t n); template std::string print_int_as_hex(T i, bool showHexNotation = true, int overloadBitSize = 0) { diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 3877d65..bb294f8 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -391,11 +391,16 @@ def getTargetGfxVersion(device, silent=False): :param silent: Turn on to silence error output (you plan to handle manually). Default is off. 
""" - gfx_version = c_uint64() + target_graphics_version = c_uint64() + market_name = str(getDeviceName(device, True)) gfx_ver_ret = "N/A" - ret = rocmsmi.rsmi_dev_target_graphics_version_get(device, byref(gfx_version)) + ret = rocmsmi.rsmi_dev_target_graphics_version_get(device, byref(target_graphics_version)) + target_graphics_version = str(target_graphics_version.value) if rsmi_ret_ok(ret, device, 'get_target_gfx_version', silent=silent): - gfx_ver_ret = "gfx" + str(gfx_version.value) + if len(target_graphics_version) == 4 and ("Instinct MI2" in market_name): + hex_part = str(hex(int(str(target_graphics_version)[2:]))).replace("0x", "") + target_graphics_version = str(target_graphics_version)[:2] + hex_part + gfx_ver_ret = "gfx" + str(target_graphics_version) return gfx_ver_ret def getNodeId(device, silent=False): @@ -753,6 +758,19 @@ def getMemoryPartition(device, silent=True): return str(currentMemoryPartition.value.decode()) return "N/A" +def getMemoryPartitionCapabilities(device, silent=True): + """ Return the current memory partition capablities of a given device + + :param device: DRM device identifier + :param silent: Turn on to silence error output + (you plan to handle manually). Default is on. 
+ """ + memoryPartitionCapabilities = create_string_buffer(MAX_BUFF_SIZE) + ret = rocmsmi.rsmi_dev_memory_partition_capabilities_get(device, memoryPartitionCapabilities, MAX_BUFF_SIZE) + if rsmi_ret_ok(ret, device, 'get_compute_partition', silent) and memoryPartitionCapabilities.value.decode(): + return str(memoryPartitionCapabilities.value.decode()) + return "N/A" + def print2DArray(dataArray): """ Print 2D Array with uniform spacing """ @@ -1823,14 +1841,20 @@ def showProgressbar(title="", timeInSeconds=13): time.sleep(1) -def setMemoryPartition(deviceList, memoryPartition): +def setMemoryPartition(deviceList, memoryPartition, autoRespond): """ Sets memory partition (memory partition) for a list of devices :param deviceList: List of DRM devices (can be a single-item list) :param memoryPartition: Memory Partition type to set as """ + addExtraLine=False printLogSpacer(' Set memory partition to %s ' % (str(memoryPartition).upper())) + confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond) for device in deviceList: + current_memory_partition = getMemoryPartition(device, silent=True) + if current_memory_partition == 'N/A': + printLog(device, 'Not supported on the given system', None, addExtraLine) + continue memoryPartition = memoryPartition.upper() if memoryPartition not in memory_partition_type_l: printErrLog(device, 'Invalid memory partition type %s' @@ -1839,8 +1863,9 @@ def setMemoryPartition(deviceList, memoryPartition): (', '.join(map(str, memory_partition_type_l))) )) return (None, None) + kTimeWait = 40 t1 = multiprocessing.Process(target=showProgressbar, - args=("Updating memory partition",13,)) + args=("Updating memory partition",kTimeWait,)) t1.start() addExtraLine=True start=time.time() @@ -1862,12 +1887,19 @@ def setMemoryPartition(deviceList, memoryPartition): printLog(device, 'Permission denied', None, addExtraLine) elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: printLog(device, 'Not supported on the given system', None, addExtraLine) + 
+ elif ret == rsmi_status_t.RSMI_STATUS_INVALID_ARGS: + printLog(device, 'Device does not support setting to ' + str(memoryPartition).upper(), None, addExtraLine) + memory_partition_caps = getMemoryPartitionCapabilities(device, silent=True) + printLog(device, 'Available memory partition modes: ' + str(memory_partition_caps).upper(), None, addExtraLine) elif ret == rsmi_status_t.RSMI_STATUS_BUSY: printLog(device, 'Device is currently busy, try again later', None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: + printLog(device, 'Issue reloading driver, please check dmsg for errors', + None, addExtraLine) else: rsmi_ret_ok(ret, device, 'set_memory_partition') - printErrLog(device, 'Failed to retrieve memory partition, even though device supports it.') + printErrLog(device, 'Failed to set memory partition, even though device supports it.') printLogSpacer() def showVersion(isCSV=False): @@ -3844,6 +3876,32 @@ def confirmOutOfSpecWarning(autoRespond): else: sys.exit('Confirmation not given. Exiting without setting value') +def confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond): + """ Print the warning about changing the memory (NPS) partition mode and reloading the amdgpu driver, then prompt the user to accept the terms. + + :param autoRespond: Response to automatically provide for all prompts + """ + print(''' + ******WARNING******\n + Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads. + ROCm SMI will then attempt to change memory (NPS) partition mode. + Upon a successful set, ROCm SMI will then initiate an action to restart amdgpu driver. + This action will change all GPU's in the hive to the requested memory (NPS) partition mode. + + Please use this utility with caution. + ''') + if not autoRespond: + user_input = input('Do you accept these terms? [Y/N] ') + else: + user_input = autoRespond + if user_input in ['Yes', 'yes', 'y', 'Y', 'YES']: + print('') + return + else: + print('Confirmation not given. 
Exiting without setting value') + printLogSpacer() + sys.exit(1) + def doesDeviceExist(device): """ Check whether the specified device exists @@ -4503,7 +4561,7 @@ def isConciseInfoRequested(args): if args.setcomputepartition: setComputePartition(deviceList, args.setcomputepartition[0]) if args.setmemorypartition: - setMemoryPartition(deviceList, args.setmemorypartition[0]) + setMemoryPartition(deviceList, args.setmemorypartition[0], args.autorespond) if args.resetprofile: resetProfile(deviceList) if args.resetxgmierr: diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 274fa3c..68fe5ed 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -3262,7 +3262,7 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, ss << __PRETTY_FUNCTION__ << " | inside success fallback... " << " | Device #: " << std::to_string(dv_ind) - << " | Type = " << devInfoTypesStrings.at(mem_type_file) + << " | Type = " << amd::smi::Device::get_type_string(mem_type_file) << " | Data: total = " << std::to_string(*total) << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); LOG_DEBUG(ss); @@ -3273,7 +3273,7 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, ss << __PRETTY_FUNCTION__ << " | after fallback... " << " | Device #: " << std::to_string(dv_ind) - << " | Type = " << devInfoTypesStrings.at(mem_type_file) + << " | Type = " << amd::smi::Device::get_type_string(mem_type_file) << " | Data: total = " << std::to_string(*total) << " | ret = " << getRSMIStatusString(ret); LOG_DEBUG(ss); @@ -3322,7 +3322,7 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, ss << __PRETTY_FUNCTION__ << " no fallback needed! 
- " << " | Device #: " << std::to_string(dv_ind) - << " | Type = " << devInfoTypesStrings.at(mem_type_file) + << " | Type = " << amd::smi::Device::get_type_string(mem_type_file) << " | Data: Used = " << std::to_string(*used) << " | Data: total = " << std::to_string(total) << " | ret = " << getRSMIStatusString(ret); @@ -3333,7 +3333,7 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, ss << __PRETTY_FUNCTION__ << " | in fallback == success ..." << " | Device #: " << std::to_string(dv_ind) - << " | Type = " << devInfoTypesStrings.at(mem_type_file) + << " | Type = " << amd::smi::Device::get_type_string(mem_type_file) << " | Data: Used = " << std::to_string(*used) << " | Data: total = " << std::to_string(total) << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); @@ -3344,7 +3344,7 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, ss << __PRETTY_FUNCTION__ << " | at end!!!! after fallback ..." << " | Device #: " << std::to_string(dv_ind) - << " | Type = " << devInfoTypesStrings.at(mem_type_file) + << " | Type = " << amd::smi::Device::get_type_string(mem_type_file) << " | Data: Used = " << std::to_string(*used) << " | ret = " << getRSMIStatusString(ret); LOG_DEBUG(ss); @@ -4629,7 +4629,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Cause: len was 0 or compute_partition variable was null" << " | Returning = " << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; @@ -4648,7 +4648,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Cause: could not retrieve current compute 
partition" << " | Returning = " << getRSMIStatusString(ret) << " |"; @@ -4665,7 +4665,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Cause: requested size was insufficient" << " | Returning = " << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |"; @@ -4677,7 +4677,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, << " | Success " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Data: " << compute_partition << " | Returning = " << getRSMIStatusString(ret) << " |"; @@ -4704,7 +4704,7 @@ is_available_compute_partition(uint32_t dv_ind, << " | FAIL " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevAvailableComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) << " | Data: could not retrieve requested data" << " | Returning = " << getRSMIStatusString(ret) << " |"; @@ -4723,7 +4723,7 @@ is_available_compute_partition(uint32_t dv_ind, << " | Success " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevAvailableComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) << " | Data: available_partitions = " << availableComputePartitions << " | Data: isComputePartitionAvailable = " << (isComputePartitionAvailable ? 
"True" : "False") @@ -4766,7 +4766,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Data: " << newComputePartitionStr << " | Cause: requested setting was invalid" << " | Returning = " @@ -4785,7 +4785,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Data: " << newComputePartitionStr << " | Cause: not an available compute partition setting" << " | Returning = " @@ -4805,7 +4805,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Cause: could retrieve current compute partition or retrieved" << " unexpected data" << " | Returning = " @@ -4821,7 +4821,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Success - compute partition was already set at requested value" << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Data: " << newComputePartitionStr << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"; @@ -4847,7 +4847,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Success " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition) << " | Data: " << newComputePartitionStr << " | Returning = " << getRSMIStatusString(returnResponse) << " |"; @@ -4901,12 +4901,22 @@ rsmi_dev_memory_partition_set(uint32_t 
dv_ind, LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX + const uint32_t kMaxBoardLength = 128; bool isCorrectDevice = false; - char boardName[128]; + char boardName[kMaxBoardLength]; boardName[0] = '\0'; + + const uint32_t kMaxMemoryCapabilitiesSize = 30; + char available_memory_capabilities[kMaxMemoryCapabilitiesSize]; + available_memory_capabilities[0] = '\0'; + + const uint32_t kMaxCurrentMemoryMode = 5; + char current_memory_mode[kMaxCurrentMemoryMode]; + current_memory_mode[0] = '\0'; + // rsmi_dev_memory_partition_set is only available for for discrete variant, // others are required to update through bios settings - rsmi_dev_name_get(dv_ind, boardName, 128); + rsmi_dev_name_get(dv_ind, boardName, static_cast(kMaxBoardLength)); std::string myBoardName = boardName; if (!myBoardName.empty()) { std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(), @@ -4919,18 +4929,19 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, if (!isCorrectDevice) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) - << " | Cause: device board name does not support this action" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) + << " | Cause: device board name does not support this action" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false); LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } + // Is the current mode already what user requested? 
switch (memory_partition) { case RSMI_MEMORY_PARTITION_NPS1: case RSMI_MEMORY_PARTITION_NPS2: @@ -4944,10 +4955,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Cause: requested setting was invalid" << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS, false); LOG_ERROR(ss); return RSMI_STATUS_INVALID_ARGS; } @@ -4965,11 +4976,11 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Cause: could retrieve current memory partition or retrieved" << " unexpected data" << " | Returning = " - << getRSMIStatusString(ret_get) << " |"; + << getRSMIStatusString(ret_get, false); LOG_ERROR(ss); return ret_get; } @@ -4982,14 +4993,55 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " setting" << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Data: " << newMemoryPartition << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"; + << getRSMIStatusString(RSMI_STATUS_SUCCESS, false); LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; } + // is this an available mode to set to? 
+ std::string memory_capabilities_str = "unknown"; + std::string user_requested_memory_partition = newMemoryPartition; + std::transform(user_requested_memory_partition.begin(), user_requested_memory_partition.end(), + user_requested_memory_partition.begin(), ::toupper); + rsmi_status_t caps_ret = rsmi_dev_memory_partition_capabilities_get(dv_ind, + available_memory_capabilities, kMaxMemoryCapabilitiesSize); + memory_capabilities_str = available_memory_capabilities; + std::transform(memory_capabilities_str.begin(), memory_capabilities_str.end(), + memory_capabilities_str.begin(), ::toupper); + ss << __PRETTY_FUNCTION__ << " | user_requested_memory_partition: " + << user_requested_memory_partition + << "; memory_capabilities_str: " << memory_capabilities_str + << "; rsmi_dev_memory_partition_capabilities_get(" << dv_ind + << ", " << user_requested_memory_partition << "): return = " + << amd::smi::getRSMIStatusString(caps_ret, false); + LOG_DEBUG(ss); + if ((caps_ret == RSMI_STATUS_SUCCESS) + && (!memory_capabilities_str.empty()) + && (!user_requested_memory_partition.empty())) { + bool is_available_mode = amd::smi::containsString(memory_capabilities_str, + user_requested_memory_partition, true); + ss << __PRETTY_FUNCTION__ + << " | is_available_mode: " << (is_available_mode ? 
"True": "False"); + LOG_DEBUG(ss); + if (is_available_mode == false) { // report RSMI_STATUS_INVALID_ARGS + ss << __PRETTY_FUNCTION__ + << " | ======= Check if available mode ======= " + << " | WARNING: detected invalid mode to set to, will try to set anyways" + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) + << " | Data (user requested mode): " << user_requested_memory_partition + << " | Available Memory Partition Modes: " << memory_capabilities_str + << " | Cause: requested setting was not an available mode" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS, false); + LOG_INFO(ss); + } + } + GET_DEV_FROM_INDX int ret = dev->writeDevInfo(amd::smi::kDevMemoryPartition, newMemoryPartition); @@ -5004,10 +5056,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Cause: issue writing reqested setting of " + newMemoryPartition << " | Returning = " - << getRSMIStatusString(err) << " |"; + << getRSMIStatusString(err, false); LOG_ERROR(ss); return err; } @@ -5018,11 +5070,79 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Success - if restart completed successfully" << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Data: " << newMemoryPartition << " | Returning = " - << getRSMIStatusString(restartRet) << " |"; + << getRSMIStatusString(restartRet, false); LOG_TRACE(ss); + if (restartRet != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail - restart AMD GPU detected" + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) + << " | Cause: issue writing 
reqested setting of " + newMemoryPartition + << " | Returning = " + << getRSMIStatusString(restartRet, false); + LOG_ERROR(ss); + return restartRet; + } + + std::string current_memory_mode_str = "unknown"; + rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR; + int maxWaitSeconds = 10; + const int k1000_MS_WAIT = 1000; + // wait until we can read SYSFS again + if (restartRet == RSMI_STATUS_SUCCESS) { + while (current_memory_mode_str != user_requested_memory_partition) { + maxWaitSeconds -= 1; + can_read_sysfs_again = + rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode); + if (can_read_sysfs_again == RSMI_STATUS_SUCCESS) { + current_memory_mode_str.clear(); + current_memory_mode_str = current_memory_mode; + ss << __PRETTY_FUNCTION__ + << " | ======= rsmi_dev_memory_partition_get ======= " + << " | Success - can read SYSFS" + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) + << " | Data (user requested mode): " << user_requested_memory_partition + << " | Current Memory Partition Mode: " << current_memory_mode_str + << " | Available Memory Partition Modes: " << memory_capabilities_str + << " | total wait time (sec): " << (10 - maxWaitSeconds) + << " | Returning = " + << getRSMIStatusString(can_read_sysfs_again, false); + LOG_TRACE(ss); + if (!current_memory_mode_str.empty() + && (current_memory_mode_str == user_requested_memory_partition)) { + break; + } + } + amd::smi::system_wait(k1000_MS_WAIT); + } + } + + if (current_memory_mode_str == user_requested_memory_partition) { + restartRet = RSMI_STATUS_SUCCESS; + } else { + restartRet = RSMI_STATUS_AMDGPU_RESTART_ERR; + } + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success - completed driver restart and all SYSFS are active" + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) + << " | Data: " << 
user_requested_memory_partition + << " | Current Memory Partition Mode: " << current_memory_mode_str + << " | Available Memory Partition Modes: " << memory_capabilities_str + << " | Returning = " + << getRSMIStatusString(restartRet, false); + LOG_TRACE(ss); + return restartRet; CATCH } @@ -5040,7 +5160,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Cause: user sent invalid arguments, len = 0 or memory partition" << " was a null ptr" << " | Returning = " @@ -5060,7 +5180,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Cause: could not successfully retrieve current memory partition " << " | Returning = " << getRSMIStatusString(ret) << " |"; @@ -5078,7 +5198,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, << " | Fail " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Cause: could not successfully retrieve current memory partition " << " | Returning = " << getRSMIStatusString(ret) << " |"; @@ -5090,7 +5210,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, << " | Success " << " | Device #: " << dv_ind << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) + << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) << " | Data: " << memory_partition << " | Returning = " << getRSMIStatusString(ret) << " |"; @@ -5099,6 +5219,78 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, CATCH } +rsmi_status_t 
rsmi_dev_memory_partition_capabilities_get( + uint32_t dv_ind, char *memory_partition_caps, uint32_t len) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + + if ((len == 0) || (memory_partition_caps == nullptr)) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableMemoryPartition) + << " | Cause: user sent invalid arguments, len = 0 or memory_partition_caps" + << " was a null ptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS, false); + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + } + CHK_SUPPORT_NAME_ONLY(memory_partition_caps) + DEVICE_MUTEX + + std::string availableMemoryPartitions; + rsmi_status_t ret = + get_dev_value_line(amd::smi::kDevAvailableMemoryPartition, + dv_ind, &availableMemoryPartitions); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableMemoryPartition) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret, false); + LOG_ERROR(ss); + return ret; + } + + std::size_t length = availableMemoryPartitions.copy(memory_partition_caps, len-1); + memory_partition_caps[length]='\0'; + + if (len < (availableMemoryPartitions.size() + 1)) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableMemoryPartition) + << " | Cause: requested size was insufficient" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE, false); + LOG_ERROR(ss); + return RSMI_STATUS_INSUFFICIENT_SIZE; + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + 
<< " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableMemoryPartition) + << " | Data: " << memory_partition_caps + << " | Returning = " + << getRSMIStatusString(ret, false); + LOG_TRACE(ss); + return ret; + CATCH +} + rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { TRY diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 8f2f170..4ef1a97 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -128,6 +128,7 @@ static const char *kDevAvailableComputePartitionFName = "available_compute_partition"; static const char *kDevComputePartitionFName = "current_compute_partition"; static const char *kDevMemoryPartitionFName = "current_memory_partition"; +static const char *kDevAvailableMemoryPartitionFName = "available_memory_partition"; // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -306,6 +307,7 @@ static const std::map kDevAttribNameMap = { {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, {kDevComputePartition, kDevComputePartitionFName}, {kDevMemoryPartition, kDevMemoryPartitionFName}, + {kDevAvailableMemoryPartition, kDevAvailableMemoryPartitionFName}, }; static const std::map kDevPerfLvlMap = { @@ -452,6 +454,7 @@ Device::devInfoTypesStrings = { {kDevAvailableComputePartition, "kDevAvailableComputePartition"}, {kDevComputePartition, "kDevComputePartition"}, {kDevMemoryPartition, "kDevMemoryPartition"}, + {kDevAvailableMemoryPartition, "kDevAvailableMemoryPartition"}, }; static const std::map kDevFuncDependsMap = { @@ -680,7 +683,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (ret != 0) { ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file (" << sysfs_path - << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type) + << ") for DevInfoInfoType (" << get_type_string(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); 
return ret; @@ -689,7 +692,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { ss << __PRETTY_FUNCTION__ << " | Issue: File is not a regular file - SYSFS file (" << sysfs_path << ") for " - << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << ")," + << "DevInfoInfoType (" << get_type_string(type) << ")," << " returning ENOENT (" << std::strerror(ENOENT) << ")"; LOG_ERROR(ss); return ENOENT; @@ -700,7 +703,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (!fs->is_open()) { ss << __PRETTY_FUNCTION__ << " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for " - << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << "), " + << "DevInfoInfoType (" << get_type_string(type) << "), " << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; LOG_ERROR(ss); @@ -709,7 +712,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file (" << sysfs_path - << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type) + << ") for DevInfoInfoType (" << get_type_string(type) << ")"; LOG_INFO(ss); return 0; @@ -726,7 +729,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { ret = openDebugFileStream(type, &fs); if (ret != 0) { ss << "Could not read debugInfoStr for DevInfoType (" - << devInfoTypesStrings.at(type)<< "), returning " + << get_type_string(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -740,7 +743,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { fs.close(); ss << "Successfully read debugInfoStr for DevInfoType (" - << devInfoTypesStrings.at(type)<< "), retString= " << *retStr; + << get_type_string(type) << "), retString= " << *retStr; LOG_INFO(ss); return 0; @@ -756,7 +759,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss 
<< "Could not read device info string for DevInfoType (" - << devInfoTypesStrings.at(type) << "), returning " + << get_type_string(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -765,8 +768,8 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { fs >> *retStr; fs.close(); ss << __PRETTY_FUNCTION__ - << "Successfully read device info string for DevInfoType (" << - devInfoTypesStrings.at(type) << "): " + *retStr + << "Successfully read device info string for DevInfoType (" + << get_type_string(type) << "): " + *retStr << " | " << (fs.is_open() ? " File stream is opened" : " File stream is closed") << " | " << (fs.bad() ? "[ERROR] Bad read operation" : @@ -801,7 +804,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.close(); ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; " << "Could not write device info string (" << valStr - << ") for DevInfoType (" << devInfoTypesStrings.at(type) + << ") for DevInfoType (" << get_type_string(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -812,7 +815,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.flush(); fs.close(); ss << "Successfully wrote device info string (" << valStr - << ") for DevInfoType (" << devInfoTypesStrings.at(type) + << ") for DevInfoType (" << get_type_string(type) << "), returning RSMI_STATUS_SUCCESS"; LOG_INFO(ss); ret = RSMI_STATUS_SUCCESS; @@ -826,7 +829,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.close(); ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; " << "Could not write device info string (" << valStr - << ") for DevInfoType (" << devInfoTypesStrings.at(type) + << ") for DevInfoType (" << get_type_string(type) << "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret)); ss << " | " << (fs.is_open() ? 
"[ERROR] File stream open" : @@ -913,20 +916,29 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss << "Could not read DevInfoLine for DevInfoType (" - << devInfoTypesStrings.at(type) << ")"; + << get_type_string(type) << ")"; LOG_ERROR(ss); return ret; } std::getline(fs, *line); ss << "Successfully read DevInfoLine for DevInfoType (" - << devInfoTypesStrings.at(type) << "), returning *line = " + << get_type_string(type) << "), returning *line = " << *line; LOG_INFO(ss); return 0; } +const char* Device::get_type_string(DevInfoTypes type) { + auto ite = devInfoTypesStrings.find(type); + if (ite != devInfoTypesStrings.end()) { + return ite->second; + } + + return "Unknown"; +} + int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, void *p_binary_data) { auto sysfs_path = path_; @@ -939,7 +951,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, ptr = fopen(sysfs_path.c_str(), "rb"); if (!ptr) { ss << "Could not read DevInfoBinary for DevInfoType (" - << devInfoTypesStrings.at(type) << ")" + << get_type_string(type) << ")" << " - SYSFS (" << sysfs_path << ")" << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; @@ -951,7 +963,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, fclose(ptr); if ((num*b_size) != b_size) { ss << "Could not read DevInfoBinary for DevInfoType (" - << devInfoTypesStrings.at(type) << ") - SYSFS (" + << get_type_string(type) << ") - SYSFS (" << sysfs_path << "), binary size error; " << "[buff: " << p_binary_data @@ -966,7 +978,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, } if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { ss << "Successfully read DevInfoBinary for DevInfoType (" - << devInfoTypesStrings.at(type) << ") - SYSFS (" + << get_type_string(type) << ") - SYSFS (" << sysfs_path << "), returning binaryData = " << p_binary_data << "; 
byte_size = " << std::dec << static_cast(b_size); @@ -999,7 +1011,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" - << devInfoTypesStrings.at(type) << ")" + << get_type_string(type) << ")" << ", but contained no string lines"; LOG_ERROR(ss); return ENXIO; @@ -1017,12 +1029,12 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, if (!allLines.empty()) { ss << "Successfully read devInfoMultiLineStr for DevInfoType (" - << devInfoTypesStrings.at(type) << ") " + << get_type_string(type) << ") " << ", returning lines read = " << allLines; LOG_INFO(ss); } else { ss << "Read devInfoMultiLineStr for DevInfoType (" - << devInfoTypesStrings.at(type) << ")" + << get_type_string(type) << ")" << ", but lines were empty"; LOG_INFO(ss); return ENXIO; @@ -1193,6 +1205,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevMemoryPartition: case kDevNumaNode: case kDevXGMIPhysicalID: + case kDevAvailableMemoryPartition: return readDevInfoStr(type, val); break; @@ -1370,10 +1383,15 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, rsmi_status_t Device::restartAMDGpuDriver(void) { REQUIRE_ROOT_ACCESS + std::ostringstream ss; bool restartSuccessful = true; bool success = false; std::string out; bool wasGdmServiceActive = false; + bool restartInProgress = true; + bool isRestartInProgress = true; + bool isAMDGPUModuleLive = false; + std::string captureRestartErr; // sudo systemctl is-active gdm // we do not care about the success of checking if gdm is active @@ -1382,8 +1400,8 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { (restartSuccessful = true); // if gdm is active -> sudo systemctl stop gdm - // TODO: are are there other display manager's we need to take into account? - // see https://en.wikipedia.org/wiki/GNOME_Display_Manager + // TODO(AMD_SMI_team): are are there other display manager's we need to take into account? 
+ // see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB if (success && (out == "active")) { wasGdmServiceActive = true; std::tie(success, out) = executeCommand("systemctl stop gdm&", false); @@ -1393,8 +1411,13 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { // sudo modprobe -r amdgpu // sudo modprobe amdgpu std::tie(success, out) = - executeCommand("modprobe -r amdgpu && modprobe amdgpu&", false); + executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true); restartSuccessful &= success; + captureRestartErr = out; + + if (success) { + restartSuccessful = false; + } // if gdm was active -> sudo systemctl start gdm if (wasGdmServiceActive) { @@ -1402,7 +1425,61 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { restartSuccessful &= success; } - return (restartSuccessful ? RSMI_STATUS_SUCCESS : + // Return early if there was an issue restarting amdgpu + if (!restartSuccessful) { + ss << __PRETTY_FUNCTION__ << " | [WARNING] Issue found during amdgpu restart: " + << captureRestartErr << "; retartSuccessful: " << (restartSuccessful ? "True" : "False"); + LOG_INFO(ss); + return RSMI_STATUS_AMDGPU_RESTART_ERR; + } + + // wait for amdgpu module to come back up + rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress, + &isAMDGPUModuleLive); + const int kTimeToWaitForDriverMSec = 1000; + int maxLoops = 10; // wait a max of 10 sec + while (status != RSMI_STATUS_SUCCESS) { + maxLoops -= 1; + if (maxLoops == 0) { + break; + } + amd::smi::system_wait(kTimeToWaitForDriverMSec); + status = Device::isRestartInProgress(&isRestartInProgress, + &isAMDGPUModuleLive); + } + + return ((restartSuccessful && (!isRestartInProgress && isAMDGPUModuleLive)) ? 
+ RSMI_STATUS_SUCCESS : + RSMI_STATUS_AMDGPU_RESTART_ERR); +} + +rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress, + bool *isAMDGPUModuleLive) { + REQUIRE_ROOT_ACCESS + std::ostringstream ss; + bool restartSuccessful = true; + bool success = false; + std::string out; + bool deviceRestartInProgress = true; // Assume in progress, we intend to disprove + bool isSystemAMDGPUModuleLive = false; // Assume AMD GPU module is not live, + // we intend to disprove + + // wait for amdgpu module to come back up + std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true); + ss << __PRETTY_FUNCTION__ + << " | success = " << success + << " | out = " << out; + LOG_DEBUG(ss); + if ((success == true) && (!out.empty())) { + isSystemAMDGPUModuleLive = containsString(out, "live"); + } + if (isSystemAMDGPUModuleLive) { // test the probed module state, not the (always non-null) out-pointer + deviceRestartInProgress = false; + } + *isRestartInProgress = deviceRestartInProgress; + *isAMDGPUModuleLive = isSystemAMDGPUModuleLive; + + return ((*isAMDGPUModuleLive && !*isRestartInProgress) ?
RSMI_STATUS_SUCCESS : RSMI_STATUS_AMDGPU_RESTART_ERR); } diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 8e3d246..0185fed 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -567,9 +567,9 @@ std::string RocmSMI::getRSMIEnvVarInfo(void) { for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - ss << (std::to_string(*it) + " (" + Device::devInfoTypesStrings.at(type) + ")"); + ss << (std::to_string(*it) + " (" + Device::get_type_string(type) + ")"); auto temp_it = it; - if(++temp_it != env_vars_.enum_overrides.end()) { + if (++temp_it != env_vars_.enum_overrides.end()) { ss << ", "; } } diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 8437a27..55d4bc2 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -63,6 +63,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -325,6 +326,7 @@ rsmi_status_t ErrnoToRsmiStatus(int err) { case EIO: return RSMI_STATUS_UNEXPECTED_SIZE; case ENXIO: return RSMI_STATUS_UNEXPECTED_DATA; case EBUSY: return RSMI_STATUS_BUSY; + case EINVAL: return RSMI_STATUS_INVALID_ARGS; default: return RSMI_STATUS_UNKNOWN_ERROR; } } @@ -397,14 +399,14 @@ std::pair executeCommand(std::string command, bool stdOut) { char buffer[128]; std::string stdoutAndErr; bool successfulRun = true; - command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering + command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering FILE *pipe = popen(command.c_str(), "r"); if (!pipe) { stdoutAndErr = "[ERROR] popen failed to call " + command; successfulRun = false; } else { - //read until end of process + // read until end of process while (!feof(pipe)) { // use buffer to read and add to stdoutAndErr if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { @@ -427,8 +429,19 @@ std::pair executeCommand(std::string command, bool stdOut) { // originalString - 
string to search for substring // substring - string looking to find -bool containsString(std::string originalString, std::string substring) { - return (originalString.find(substring) != std::string::npos); +// displayComparisons = defaults to false, set to true to see debug prints +bool containsString(std::string originalString, std::string substring, + bool displayComparisons) { + std::ostringstream ss; + bool found = originalString.find(substring) != std::string::npos; + if (displayComparisons) { + ss << __PRETTY_FUNCTION__ + << " | originalString: " << originalString + << " | substring: " << substring + << " | found: " << (found ? "True": "False"); + LOG_TRACE(ss); + } + return found; } // Creates and stores supplied data into a temporary file (within /tmp/). @@ -1162,7 +1175,9 @@ rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_vers // separate out parts -> put back into normal graphics version format major = static_cast((orig_target_version / 10000) * 100); minor = static_cast((orig_target_version % 10000 / 100) * 10); - if (minor == 0) major *= 10; // 0 as a minor is correct, but bump up by 10 + if ((minor == 0) && (countDigit(major) < 4)) { + major *= 10; // 0 as a minor is correct, but bump up by 10 + } rev = static_cast(orig_target_version % 100); *gfx_version = "gfx" + std::to_string(major + minor + rev); ss << __PRETTY_FUNCTION__ @@ -1204,6 +1219,31 @@ std::queue getAllDeviceGfxVers() { return deviceGfxVersions; } +// milli_seconds: time to wait, in milliseconds +// 1 sec = 1000ms +// .5 sec = 500ms +void system_wait(int milli_seconds) { + std::ostringstream ss; + auto start = std::chrono::high_resolution_clock::now(); + // 1 ms = 1000 us + int waitTime = milli_seconds * 1000; + ss << __PRETTY_FUNCTION__ << " | " + << "** Waiting for " << std::dec << waitTime + << " us (" << waitTime/1000 << " milli-seconds) **"; + LOG_DEBUG(ss); + usleep(waitTime); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = + 
std::chrono::duration_cast(stop - start); + ss << __PRETTY_FUNCTION__ << " | " + << "** Waiting took " << duration.count() / 1000 + << " milli-seconds **"; + LOG_DEBUG(ss); +} + +int countDigit(uint64_t n) { // number of decimal digits in n (0 counts as one digit); integer-only, so no log10(0) or double rounding + return (n < 10) ? 1 : 1 + countDigit(n / 10); +} } // namespace smi } // namespace amd diff --git a/tests/rocm_smi_test/functional/memorypartition_read_write.cc b/tests/rocm_smi_test/functional/memorypartition_read_write.cc index 2dbc460..81b7161 100755 --- a/tests/rocm_smi_test/functional/memorypartition_read_write.cc +++ b/tests/rocm_smi_test/functional/memorypartition_read_write.cc @@ -111,13 +111,13 @@ mapStringToRSMIMemoryPartitionTypes { }; void TestMemoryPartitionReadWrite::Run(void) { - GTEST_SKIP_("Temporarily disabled"); // Skipped due to SWDEV-491215 - - // will be re-enabled in rocm 6.4 - rsmi_status_t ret, err; + rsmi_status_t ret, err, ret_set; char orig_memory_partition[255]; char current_memory_partition[255]; + char current_memory_capabilities[255]; orig_memory_partition[0] = '\0'; current_memory_partition[0] = '\0'; + current_memory_capabilities[0] = '\0'; TestBase::Run(); if (setup_failed_) { @@ -179,7 +179,18 @@ void TestMemoryPartitionReadWrite::Run(void) { if (err == RSMI_STATUS_INVALID_ARGS) { IF_VERB(STANDARD) { - std::cout << "\t**" + std::cout << "\t**rsmi_dev_memory_partition_get(dv_ind, nullptr, 255): " + << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." + << std::endl; + } + } + + err = rsmi_dev_memory_partition_capabilities_get(dv_ind, nullptr, 255); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + + if (err == RSMI_STATUS_INVALID_ARGS) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_memory_partition_capabilities_get(dv_ind, nullptr, 255): " << "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl; } @@ -187,11 +198,24 @@ void TestMemoryPartitionReadWrite::Run(void) { // Verify api support checking functionality is working err = rsmi_dev_memory_partition_get(dv_ind, orig_memory_partition, 0); + ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || + (err == RSMI_STATUS_NOT_SUPPORTED)); + if (err == RSMI_STATUS_INVALID_ARGS) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_memory_partition_get(dv_ind, orig_memory_partition, 0): " + << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." + << std::endl; + } + } + + err = rsmi_dev_memory_partition_capabilities_get(dv_ind, current_memory_capabilities, 0); ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || (err == RSMI_STATUS_NOT_SUPPORTED)); if (err == RSMI_STATUS_INVALID_ARGS) { IF_VERB(STANDARD) { std::cout << "\t**" + << "rsmi_dev_memory_partition_capabilities_get(dv_ind, " + << "current_memory_capabilities, 0): " << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." << std::endl; } @@ -256,6 +280,8 @@ void TestMemoryPartitionReadWrite::Run(void) { for (int partition = RSMI_MEMORY_PARTITION_NPS1; partition <= RSMI_MEMORY_PARTITION_NPS8; partition++) { + ret_set = RSMI_STATUS_NOT_SUPPORTED; + wasSetSuccess = false; new_memory_partition = static_cast(partition); IF_VERB(STANDARD) { std::cout << std::endl; @@ -269,17 +295,46 @@ void TestMemoryPartitionReadWrite::Run(void) { << "Attempting to set memory partition to: " << memoryPartitionString(new_memory_partition) << std::endl; } - ret = rsmi_dev_memory_partition_set(dv_ind, new_memory_partition); - if (ret == RSMI_STATUS_NOT_SUPPORTED) { + + rsmi_status_t ret_caps = rsmi_dev_memory_partition_capabilities_get(dv_ind, + current_memory_capabilities, 255); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "rsmi_dev_memory_partition_capabilities_get(" << dv_ind + << ", current_memory_capabilities, 255): " + << amd::smi::getRSMIStatusString(ret_caps, false) << std::endl; + std::cout << "\t**" + << "current_memory_capabilities: " << current_memory_capabilities 
+ << std::endl; + } + ASSERT_TRUE((ret_caps == RSMI_STATUS_NOT_SUPPORTED) || + (ret_caps == RSMI_STATUS_SUCCESS)); + + ret_set = rsmi_dev_memory_partition_set(dv_ind, new_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" << "rsmi_dev_memory_partition_set(" + << dv_ind << " , " << memoryPartitionString(new_memory_partition) << "): " + << amd::smi::getRSMIStatusString(ret_set, false) << "\n"; + } + if (ret_set == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { std::cout << "\t**" << ": " << "Not supported on this machine" << std::endl; } break; } else { - CHK_ERR_ASRT(ret) + ASSERT_TRUE((ret_set == RSMI_STATUS_SUCCESS) + || (ret_set == RSMI_STATUS_BUSY) + || (ret_set == RSMI_STATUS_AMDGPU_RESTART_ERR) + || (ret_set == RSMI_STATUS_INVALID_ARGS) + || (ret_set == RSMI_STATUS_NOT_SUPPORTED)); } - if (ret != RSMI_STATUS_SUCCESS) { // do not continue trying to reset + IF_VERB(STANDARD) { + std::cout << "\t**" << "rsmi_dev_memory_partition_set(" + << dv_ind << " , " << memoryPartitionString(new_memory_partition) << "): " + << amd::smi::getRSMIStatusString(ret_set, false) << "\n"; + } + if (ret_set == RSMI_STATUS_SUCCESS) { // do not continue trying to reset wasSetSuccess = true; } @@ -291,17 +346,25 @@ void TestMemoryPartitionReadWrite::Run(void) { << "Current memory partition: " << current_memory_partition << std::endl; } - ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); - ASSERT_STREQ(memoryPartitionString(new_memory_partition).c_str(), + if (wasSetSuccess) { + ASSERT_EQ(RSMI_STATUS_SUCCESS, ret_set); + ASSERT_STREQ(memoryPartitionString(new_memory_partition).c_str(), + current_memory_partition); + CHK_ERR_ASRT(ret_set) + } else { + ASSERT_NE(RSMI_STATUS_SUCCESS, ret_set); + ASSERT_STRNE(memoryPartitionString(new_memory_partition).c_str(), current_memory_partition); + } } - /* TEST RETURN TO BOOT MEMORY PARTITION SETTING */ + /* TEST RETURN TO ORIGINAL MEMORY PARTITION SETTING */ IF_VERB(STANDARD) { std::cout << std::endl; std::cout << "\t**" - << "=========== TEST 
RETURN TO BOOT MEMORY PARTITION " - << "SETTING ========" << std::endl; + << "=========== TEST RETURN TO ORIGINAL MEMORY PARTITION " + << "SETTING ( " << orig_memory_partition + << " ) ========" << std::endl; } std::string oldMode = current_memory_partition; @@ -312,31 +375,7 @@ void TestMemoryPartitionReadWrite::Run(void) { << "Current memory partition: " << current_memory_partition << std::endl; } - if (wasSetSuccess) { - ASSERT_STRNE(oldMode.c_str(), current_memory_partition); - IF_VERB(STANDARD) { - std::cout << "\t**" - << "Confirmed prior memory partition (" << oldMode << ") is " - << "not equal to current memory partition (" - << current_memory_partition << ")" << std::endl; - } - } else { - ASSERT_STREQ(oldMode.c_str(), current_memory_partition); - IF_VERB(STANDARD) { - std::cout << "\t**" - << "Confirmed prior memory partition (" << oldMode << ") is " - << "equal to current memory partition (" - << current_memory_partition << ")" << std::endl; - } - } - /* TEST RETURN TO ORIGINAL MEMORY PARTITION SETTING */ - IF_VERB(STANDARD) { - std::cout << std::endl; - std::cout << "\t**" - << "=========== TEST RETURN TO ORIGINAL MEMORY PARTITION " - << "SETTING ========" << std::endl; - } new_memory_partition = mapStringToRSMIMemoryPartitionTypes.at(orig_memory_partition); IF_VERB(STANDARD) { @@ -344,6 +383,12 @@ void TestMemoryPartitionReadWrite::Run(void) { << memoryPartitionString(new_memory_partition) << std::endl; } ret = rsmi_dev_memory_partition_set(dv_ind, new_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "rsmi_dev_memory_partition_set(" << dv_ind + << ", " << orig_memory_partition << "): " + << amd::smi::getRSMIStatusString(ret, false) << std::endl; + } CHK_ERR_ASRT(ret) ret = rsmi_dev_memory_partition_get(dv_ind, current_memory_partition, 255); CHK_ERR_ASRT(ret) @@ -355,6 +400,12 @@ void TestMemoryPartitionReadWrite::Run(void) { << std::endl; } ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); - 
ASSERT_STREQ(memoryPartitionString(new_memory_partition).c_str(), current_memory_partition); + ASSERT_STREQ(orig_memory_partition, current_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed prior memory partition (" << orig_memory_partition + << ") is equal to current memory partition (" + << current_memory_partition << ")" << std::endl; + } } }