Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sw/nic/gpuagent/api/gpu_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ aga_gpu_info_from_entry (void *entry, void *ctxt)
// some API operation is in progress on this object, skip it
return false;
}
if (gpu->is_parent_gpu()) {
// no need to show parent GPUs as they are placeholders
return false;
}
memset(&info, 0, sizeof(aga_gpu_info_t));
// call entry read
gpu->read(&info);
Expand Down
18 changes: 18 additions & 0 deletions sw/nic/gpuagent/api/include/aga_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,24 @@ typedef struct aga_gpu_info_s {
aga_gpu_stats_t stats;
} aga_gpu_info_t;

/// GPU profile
typedef struct aga_gpu_profile_s {
/// GPU id
uint32_t id;
/// GPU uuid
aga_obj_key_t key;
/// GPU handle
aga_gpu_handle_t handle;
/// is the GPU partition capable
bool partition_capable;
/// GPU compute partition type
aga_gpu_compute_partition_type_t compute_partition;
/// GPU compute partition id
uint32_t partition_id;
/// GPU memory partition type
aga_gpu_memory_partition_type_t memory_partition;
} aga_gpu_profile_t;

/// device type enum
typedef enum aga_device_type_e {
AGA_DEVICE_TYPE_NONE = 0,
Expand Down
183 changes: 115 additions & 68 deletions sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -745,59 +745,6 @@ smi_fill_vram_status_ (aga_gpu_handle_t gpu_handle,
return SDK_RET_OK;
}

sdk_ret_t
smi_get_gpu_partition_info (aga_gpu_handle_t gpu_handle, bool *capable,
aga_gpu_compute_partition_type_t *compute_partition,
aga_gpu_memory_partition_type_t *memory_partition)
{
amdsmi_status_t amdsmi_ret;
amdsmi_gpu_metrics_t metrics_info = {};
char partition_type[AGA_MAX_STR_LEN + 1];

*capable = true;
*compute_partition = AGA_GPU_COMPUTE_PARTITION_TYPE_NONE;
*memory_partition = AGA_GPU_MEMORY_PARTITION_TYPE_NONE;
// to deduce partition capability of platform, we rely on
// metrics field num_partition of a GPU field to be 0xffff
// on partition supported platform, this api is not supported
// for paritioned GPU other than index 0 or first_handle
// we mark the capablity to true on such cases to specify platform
// partition capability
amdsmi_ret = amdsmi_get_gpu_metrics_info(gpu_handle,
&metrics_info);
if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}",
gpu_handle, amdsmi_ret);
} else {
if ((metrics_info.num_partition & 0xffff) ==
AMDSMI_INVALID_PARTITION_COUNT) {
// this is unsupported platform like Mi2xx
*capable = false;
}
}
// fill compute partition type
amdsmi_ret = amdsmi_get_gpu_compute_partition(gpu_handle,
partition_type, AGA_MAX_STR_LEN + 1);
if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to get compute partition for GPU {}, err {}",
gpu_handle, amdsmi_ret);
} else {
*compute_partition =
smi_to_aga_gpu_compute_partition_type(partition_type);
}
// fill memory partition type
amdsmi_ret = amdsmi_get_gpu_memory_partition(gpu_handle,
partition_type, AGA_MAX_STR_LEN + 1);
if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to get memory partition for GPU {}, err {}",
gpu_handle, amdsmi_ret);
} else {
*memory_partition =
smi_to_aga_gpu_memory_partition_type(partition_type);
}
return SDK_RET_OK;
}

sdk_ret_t
smi_get_gpu_partition_id (aga_gpu_handle_t gpu_handle, uint32_t *partition_id)
{
Expand Down Expand Up @@ -1611,6 +1558,39 @@ smi_gpu_fill_device_topology (aga_gpu_handle_t gpu_handle,
return SDK_RET_OK;
}

/// \brief function to get parent aga_obj_key_t for a given GPU
/// \param[in] gpu_handle GPU handle
/// \param[out] key aga_obj_key_t of the parent GPU
sdk_ret_t
smi_get_parent_gpu_uuid (aga_gpu_handle_t gpu_handle, aga_obj_key_t *parent_key)
{
amdsmi_bdf_t pcie_bdf;
amdsmi_status_t status;
amdsmi_asic_info_t asic_info;

// get device BDF
status = amdsmi_get_gpu_device_bdf(gpu_handle, &pcie_bdf);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to get PCIe BDF of GPU {}, err {}",
gpu_handle, status);
return amdsmi_ret_to_sdk_ret(status);
}
// get GPU asic info
status = amdsmi_get_gpu_asic_info(gpu_handle, &asic_info);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to get ASIC information of GPU {}, err {}",
gpu_handle, status);
return amdsmi_ret_to_sdk_ret(status);
}
// compute the parent GPU uuid
parent_key->reset();
memcpy(&parent_key->id[0], asic_info.asic_serial, 4);
*(uint16_t *)(&parent_key->id[6]) = 0x1000;
*(uint64_t *)(&parent_key->id[8]) = pcie_bdf.as_uint;

return SDK_RET_OK;
}

/// \brief function to get aga_obj_key_t for a given GPU
/// \param[in] gpu_handle GPU handle
/// \param[out] key aga_obj_key_t of the GPU
Expand Down Expand Up @@ -1645,9 +1625,80 @@ smi_gpu_uuid_get (aga_gpu_handle_t gpu_handle, aga_obj_key_t *key)
return SDK_RET_OK;
}

/// \brief function to get profile of a given GPU
/// \param[in] id GPU id
/// \param[in] handle GPU handle
/// \param[out] gpu GPU profile
static sdk_ret_t
smi_fill_gpu_profile_ (uint32_t id, aga_gpu_handle_t handle,
aga_gpu_profile_t& gpu)
{
sdk_ret_t ret;
amdsmi_status_t status;
char partition_type[AGA_MAX_STR_LEN + 1];
amdsmi_accelerator_partition_profile_t profile;
uint32_t partition_id[AMDSMI_MAX_ACCELERATOR_PARTITIONS] = { 0 };

gpu.id = id;
gpu.handle = handle;
// get GPU uuid
ret = smi_gpu_uuid_get(gpu.handle, &gpu.key);
if (ret != SDK_RET_OK) {
AGA_TRACE_ERR("GPU discovery failed due to error in getting UUID of "
"GPU {}", gpu.handle);
return ret;
}
// get accelerator partition profile
status = amdsmi_get_gpu_accelerator_partition_profile(gpu.handle, &profile,
partition_id);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
if (status == AMDSMI_STATUS_NOT_SUPPORTED) {
// set to default values; this could be a partition as this API is
// not supported on partitions
gpu.compute_partition = AGA_GPU_COMPUTE_PARTITION_TYPE_NONE;
gpu.partition_id = partition_id[0];
gpu.partition_capable = true;
} else {
AGA_TRACE_ERR("GPU discovery failed due to error in getting "
"partition profile of GPU {}, err {}", gpu.key.str(),
status);
return amdsmi_ret_to_sdk_ret(status);
}
} else {
gpu.compute_partition =
smi_to_aga_gpu_compute_partition_type(profile.profile_type);
gpu.partition_id = partition_id[0];
if (profile.num_partitions == std::numeric_limits<uint32_t>::max()) {
// num partitions is set to the max uint32_t value for GPUs which
// don't support partitions
gpu.partition_capable = false;
} else {
gpu.partition_capable = true;
}
}
// set memory partition type
status = amdsmi_get_gpu_memory_partition(gpu.handle, partition_type,
AGA_MAX_STR_LEN + 1);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
if (status == AMDSMI_STATUS_NOT_SUPPORTED) {
// set to default values; this could be a partition as this API is
// not supported on partitions
gpu.memory_partition = AGA_GPU_MEMORY_PARTITION_TYPE_NONE;
} else {
AGA_TRACE_ERR("GPU discovery failed due to error in getting "
"memory partition of GPU {}, err {}", gpu.key.str(),
status);
return amdsmi_ret_to_sdk_ret(status);
}
} else {
gpu.memory_partition =
smi_to_aga_gpu_memory_partition_type(partition_type);
}
return SDK_RET_OK;
}

sdk_ret_t
smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles,
aga_obj_key_t *gpu_keys)
smi_discover_gpus (uint32_t *num_gpu, aga_gpu_profile_t *gpu)
{
sdk_ret_t ret;
uint32_t num_procs;
Expand All @@ -1657,10 +1708,10 @@ smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles,
amdsmi_socket_handle socket_handles[AGA_MAX_SOCKET];
aga_gpu_handle_t proc_handles[AGA_MAX_PROCESSORS_PER_SOCKET];

if (!num_gpus) {
if (!num_gpu) {
return SDK_RET_ERR;
}
*num_gpus = 0;
*num_gpu = 0;
// get the socket count available in the system
status = amdsmi_get_socket_handles(&num_sockets, NULL);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
Expand Down Expand Up @@ -1704,18 +1755,14 @@ smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles,
return amdsmi_ret_to_sdk_ret(status);
}
if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
gpu_handles[*num_gpus] = proc_handles[j];
if (gpu_keys) {
ret = smi_gpu_uuid_get(proc_handles[j],
&gpu_keys[*num_gpus]);
if (ret != SDK_RET_OK) {
AGA_TRACE_ERR("GPU discovery failed due to error in "
"getting UUID of GPU {}",
proc_handles[j]);
return ret;
}
ret = smi_fill_gpu_profile_(*num_gpu, proc_handles[j],
gpu[*num_gpu]);
if (ret != SDK_RET_OK) {
AGA_TRACE_ERR("GPU discovery failed when processing GPU {}",
proc_handles[j]);
return ret;
}
(*num_gpus)++;
(*num_gpu)++;
}
}
}
Expand Down
6 changes: 5 additions & 1 deletion sw/nic/gpuagent/api/smi/amdsmi/smi_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1508,6 +1508,7 @@ sdk_ret_t
smi_state::init(aga_api_init_params_t *init_params) {
sdk_ret_t ret;
amdsmi_status_t status;
aga_gpu_profile_t gpu[AGA_MAX_GPU];

// initialize smi library
status = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
Expand All @@ -1516,10 +1517,13 @@ smi_state::init(aga_api_init_params_t *init_params) {
return amdsmi_ret_to_sdk_ret(status);
}
// discover gpus
ret = aga::smi_discover_gpus(&num_gpu_, gpu_handles_, NULL);
ret = aga::smi_discover_gpus(&num_gpu_, gpu);
if (ret != SDK_RET_OK) {
return ret;
}
for (uint32_t i = 0; i < num_gpu_; i++) {
gpu_handles_[i] = gpu[i].handle;
}
// spawn event monitor thread
spawn_event_monitor_thread_();
// spawn watcher thread
Expand Down
19 changes: 12 additions & 7 deletions sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,24 +357,29 @@ smi_to_aga_gpu_page_status (amdsmi_memory_page_status_t page_status)
return AGA_GPU_PAGE_STATUS_NONE;
}

/// \brief convert amdsmi partition type string to aga comptue partition type
/// \brief convert amdsmi accelerator partition to aga comptue partition type
/// \param[in] partition_type amdsmi partition type string
/// \return aga gpu compute partition type
static inline aga_gpu_compute_partition_type_t
smi_to_aga_gpu_compute_partition_type (std::string partition_type)
smi_to_aga_gpu_compute_partition_type (
amdsmi_accelerator_partition_type_t partition_type)
{
if (partition_type == "SPX") {
switch (partition_type) {
case AMDSMI_ACCELERATOR_PARTITION_SPX:
return AGA_GPU_COMPUTE_PARTITION_TYPE_SPX;
} else if (partition_type == "DPX") {
case AMDSMI_ACCELERATOR_PARTITION_DPX:
return AGA_GPU_COMPUTE_PARTITION_TYPE_DPX;
} else if (partition_type == "TPX") {
case AMDSMI_ACCELERATOR_PARTITION_TPX:
return AGA_GPU_COMPUTE_PARTITION_TYPE_TPX;
} else if (partition_type == "QPX") {
case AMDSMI_ACCELERATOR_PARTITION_QPX:
return AGA_GPU_COMPUTE_PARTITION_TYPE_QPX;
} else if (partition_type == "CPX") {
case AMDSMI_ACCELERATOR_PARTITION_CPX:
return AGA_GPU_COMPUTE_PARTITION_TYPE_CPX;
default:
break;
}


return AGA_GPU_COMPUTE_PARTITION_TYPE_NONE;
}

Expand Down
Loading