From e4daa77c08179ef90391002f25c52fe5c4f66458 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 2 Mar 2023 15:42:00 -0600 Subject: [PATCH] Filter out the GPUs not assigned to a container in showpid The process IDs of other containers are still visible in the sysfs file; filter out the GPUs that are not visible to this container to prevent a crash. Change-Id: I665912cd09c606804186aff8cba5c24f5e58ded7 --- src/rocm_smi.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 694015f1..98dd9bf4 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -3277,11 +3277,23 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, uint32_t i = 0; amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + // filter out the devices not visible to container + auto& nodes = smi.kfd_node_map(); + for (auto nit = gpu_set.begin(); nit != gpu_set.end();) { + uint64_t gpu_id_val = (*nit); + auto kfd_ite = nodes.find(gpu_id_val); + if (kfd_ite == nodes.end()) { + nit = gpu_set.erase(nit); + } else { + nit++; + } + } + if (dv_indices != nullptr) { for (auto it = gpu_set.begin(); i < *num_devices && it != gpu_set.end(); ++it, ++i) { uint64_t gpu_id_val = (*it); - dv_indices[i] = smi.kfd_node_map()[gpu_id_val]->amdgpu_dev_index(); + dv_indices[i] = nodes[gpu_id_val]->amdgpu_dev_index(); } }