Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions sw/nic/gpuagent/api/gpu_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,59 @@ aga_gpu_delete (_In_ aga_obj_key_t *key)
{
return aga_gpu_api_handle(API_OP_DELETE, key, NULL);
}

// arguments of a CPER read request, bundled together so that they can be
// handed to aga_gpu_cper_info_from_entry() through its opaque context pointer
typedef struct aga_gpu_cper_read_args_s {
// caller's opaque context, passed back unmodified as 2nd argument of cb
void *ctxt;
// CPER severity requested, forwarded to aga::smi_gpu_get_cper_entries()
aga_cper_severity_t severity;
// callback invoked with the CPER information gathered for a GPU
gpu_cper_read_cb_t cb;
} aga_gpu_cper_read_args_t;

static bool
aga_gpu_cper_info_from_entry (void *entry, void *ctxt)
{
sdk_ret_t ret;
aga_cper_info_t info = {};
gpu_entry *gpu = (gpu_entry *)entry;
aga_gpu_cper_read_args_t *args = (aga_gpu_cper_read_args_t *)ctxt;

if (gpu->in_use()) {
// some API operation is in progress on this object, skip it
return false;
}
if (gpu->is_parent_gpu()) {
// partition parent GPU objects can be skipped
return false;
}
// set GPU id
info.gpu = gpu->key();
// get CPER information
ret = aga::smi_gpu_get_cper_entries(gpu->handle(), args->severity, &info);
if (ret != SDK_RET_OK) {
goto done;
}
// call cb on info
args->cb(&info, args->ctxt);
done:
return false;
}

// read the CPER records of one GPU or of all GPUs
// key is the key of the GPU object; when it is k_aga_obj_key_invalid all the
// GPU objects in the db are walked
// severity is the CPER severity to read
// cb is invoked with the CPER information of every GPU that was read
// successfully and ctxt is the opaque context passed back to cb
// returns the result of the db walk when all GPUs are requested, SDK_RET_OK
// when the given GPU is found and SDK_RET_ENTRY_NOT_FOUND otherwise
sdk_ret_t
aga_gpu_cper_read (aga_obj_key_t *key, aga_cper_severity_t severity,
                   gpu_cper_read_cb_t cb, void *ctxt)
{
    gpu_entry *gpu;
    aga_gpu_cper_read_args_t args = { 0 };

    args.ctxt = ctxt;
    args.severity = severity;
    args.cb = cb;
    if (*key == k_aga_obj_key_invalid) {
        return gpu_db()->walk(aga_gpu_cper_info_from_entry, &args);
    }
    gpu = gpu_db()->find(key);
    if (!gpu) {
        return SDK_RET_ENTRY_NOT_FOUND;
    }
    // the helper returns a walk-callback style bool, not a status code, so
    // don't let it leak out as sdk_ret_t; report success explicitly instead
    aga_gpu_cper_info_from_entry(gpu, &args);
    return SDK_RET_OK;
}
88 changes: 88 additions & 0 deletions sw/nic/gpuagent/api/include/aga_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ limitations under the License.
#define AGA_GPU_MAX_BAD_PAGE_RECORD 64
#define AGA_GPU_INVALID_PARTITION_ID 0xFFFFFFFF
#define AGA_GPU_MAX_PARTITION 8
/// max. number of CPER entries reported per GPU, this is the size of the
/// cper_entry array in aga_cper_info_t
#define AGA_GPU_MAX_CPER_ENTRY 128
/// max. number of AMD field ids reported per CPER entry, this is the size of
/// the af_id array in aga_cper_entry_t
#define AGA_GPU_MAX_AF_ID_PER_CPER 12

/// number of clocks that can not be configured - AGA_GPU_CLOCK_TYPE_FABRIC,
/// AGA_GPU_CLOCK_TYPE_SOC (4), AGA_GPU_CLOCK_TYPE_DCE, AGA_GPU_CLOCK_TYPE_PCIE
#define AGA_GPU_NUM_NON_CFG_CLOCK_TYPES 7
Expand Down Expand Up @@ -793,6 +796,78 @@ typedef struct aga_gpu_memory_partition_info_s {
aga_gpu_memory_partition_type_t partition_type;
} aga_gpu_memory_partition_info_t;

/// CPER severity, used both to classify a CPER entry and to select the
/// entries to read
typedef enum aga_cper_severity_e {
/// invalid severity; when passed as the severity to read,
/// smi_gpu_get_cper_entries() treats it as "all severities"
AGA_CPER_SEVERITY_NONE = 0,
/// non-fatal uncorrected errors
AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED = 1,
/// fatal errors
AGA_CPER_SEVERITY_FATAL = 2,
/// non-fatal corrected errors
AGA_CPER_SEVERITY_NON_FATAL_CORRECTED = 3,
} aga_cper_severity_t;

/// CPER notification type, derived from the notification type GUID of the
/// CPER record header by smi_to_aga_cper_notification_type()
typedef enum aga_cper_notification_type_e {
/// invalid notification type, also used for unrecognized notification types
AGA_CPER_NOTIFICATION_TYPE_NONE = 0,
/// Corrected Machine Check (CMC)
AGA_CPER_NOTIFICATION_TYPE_CMC = 1,
/// Corrected Platform Error (CPE)
AGA_CPER_NOTIFICATION_TYPE_CPE = 2,
/// Machine Check Exception (MCE)
AGA_CPER_NOTIFICATION_TYPE_MCE = 3,
/// PCI express error
AGA_CPER_NOTIFICATION_TYPE_PCIE = 4,
/// initialization error
AGA_CPER_NOTIFICATION_TYPE_INIT = 5,
/// Non-Maskable Interrupt (NMI)
AGA_CPER_NOTIFICATION_TYPE_NMI = 6,
/// boot error
AGA_CPER_NOTIFICATION_TYPE_BOOT = 7,
/// Direct Memory Access Remapping (DMAR) error
AGA_CPER_NOTIFICATION_TYPE_DMAR = 8,
/// Synchronous External Abort (SEA)
AGA_CPER_NOTIFICATION_TYPE_SEA = 9,
/// SError Interrupt (SEI)
AGA_CPER_NOTIFICATION_TYPE_SEI = 10,
/// Platform Error Interface (PEI)
AGA_CPER_NOTIFICATION_TYPE_PEI = 11,
/// Compute Express Link component error
AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT = 12,
} aga_cper_notification_type_t;

/// CPER entry information, one instance describes a single CPER record
typedef struct aga_cper_entry_s {
/// CPER entry identifier
std::string record_id;
/// CPER error severity
aga_cper_severity_t severity;
/// CPER format revision
uint32_t revision;
/// CPER error timestamp formatted as "YYYY-MM-DD HH:MM:SS", left empty when
/// the record header does not flag its timestamp as valid
std::string timestamp;
/// CPER entry creator identifier
std::string creator_id;
/// CPER entry notification type
aga_cper_notification_type_t notification_type;
/// number of valid AMD field ids in af_id, 0 when the AMD field ids could
/// not be extracted from the record
uint32_t num_af_id;
/// AMD field ids, only the first num_af_id elements are valid
uint64_t af_id[AGA_GPU_MAX_AF_ID_PER_CPER];
} aga_cper_entry_t;

/// CPER information of a GPU, this is what is handed to gpu_cper_read_cb_t
typedef struct aga_cper_info_s {
/// GPU uuid
aga_obj_key_t gpu;
/// number of valid cper entries in cper_entry
uint32_t num_cper_entry;
/// cper entries, only the first num_cper_entry elements are valid
aga_cper_entry_t cper_entry[AGA_GPU_MAX_CPER_ENTRY];
} aga_cper_info_t;

/// \brief create gpu
/// \param[in] spec config specification
/// \return #SDK_RET_OK on success, failure status code on error
Expand Down Expand Up @@ -896,4 +971,17 @@ sdk_ret_t aga_gpu_update(_In_ aga_gpu_spec_t *spec);
/// \return #SDK_RET_OK on success, failure status code on error
sdk_ret_t aga_gpu_delete(_In_ aga_obj_key_t *key);

/// \brief callback invoked by aga_gpu_cper_read() with the CPER records of
///        a GPU
/// \param[in] info CPER information of one GPU
/// \param[in] ctxt opaque context given to aga_gpu_cper_read()
typedef void (*gpu_cper_read_cb_t)(aga_cper_info_t *info, void *ctxt);

/// \brief read gpu CPER records
/// \param[in] key key of the gpu object, if k_aga_obj_key_invalid we read
/// CPER records of all gpu
/// \param[in] severity severity of the CPER records to read
/// \param[in] gpu_cper_read_cb callback function, invoked once for every gpu
/// whose CPER records were read; gpus that are in use or that are partition
/// parents are skipped
/// \param[in] ctxt opaque context passed to gpu_cper_read_cb
/// \return #SDK_RET_OK on success, failure status code on error
/// (#SDK_RET_ENTRY_NOT_FOUND if no gpu object is found for the given key)
sdk_ret_t aga_gpu_cper_read(_In_ aga_obj_key_t *key,
_In_ aga_cper_severity_t severity,
_In_ gpu_cper_read_cb_t gpu_cper_read_cb,
_In_ void *ctxt);

#endif /// __API_INCLUDE_AGA_GPU_HPP__
111 changes: 111 additions & 0 deletions sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ limitations under the License.
///
//----------------------------------------------------------------------------

#include <sstream>
#include <iomanip>
extern "C" {
#include "nic/third-party/rocm/amd_smi_lib/include/amd_smi/amdsmi.h"
}
Expand All @@ -42,6 +44,8 @@ namespace aga {
#define AMDSMI_INVALID_UINT64 0xffffffffffffffff
#define AMDSMI_DEEP_SLEEP_THRESHOLD 140
#define AMDSMI_COUNTER_RESOLUTION 15.3
/// size of the scratch buffer handed to amdsmi_get_gpu_cper_entries() to
/// receive raw CPER record data
#define CPER_BUF_SIZE (4 * 1024 * 1024) // 4 MB


/// cache GPU metrics so that we don't do repeated calls while filling spec,
/// status and statistics
Expand Down Expand Up @@ -1729,4 +1733,111 @@ smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec,
return SDK_RET_OK;
}

/// \brief render a CPER timestamp as a "YYYY-MM-DD HH:MM:SS" string, every
///        field is zero padded to its width
/// \param[in] ts CPER timestamp
/// \return formatted timestamp string
static inline std::string
timestamp_string_from_cper_timestamp (amdsmi_cper_timestamp_t *ts)
{
    std::ostringstream out;
    // the year field is assumed to be an offset from 2000
    const uint32_t year = 2000 + ts->year;

    // fill character is sticky, field width must be set for every field
    out << std::setfill('0');
    // date
    out << std::setw(4) << year << "-";
    out << std::setw(2) << static_cast<int>(ts->month) << "-";
    out << std::setw(2) << static_cast<int>(ts->day) << " ";
    // time of day
    out << std::setw(2) << static_cast<int>(ts->hours) << ":";
    out << std::setw(2) << static_cast<int>(ts->minutes) << ":";
    out << std::setw(2) << static_cast<int>(ts->seconds);
    return out.str();
}

/// \brief read the CPER records of a GPU and append them to the given info
/// \param[in] gpu_handle GPU handle
/// \param[in] severity severity of the CPER records to read, any value other
///                     than non-fatal uncorrected, fatal or non-fatal
///                     corrected selects all three severities
/// \param[in,out] info CPER information; entries are appended starting at
///                     info->num_cper_entry (callers are expected to pass a
///                     zero initialized object) and at most
///                     AGA_GPU_MAX_CPER_ENTRY entries are ever stored
/// \return SDK_RET_OK on success, SDK_RET_OOM if the scratch buffer can't be
///         allocated, translated amdsmi status code on other failures
sdk_ret_t
smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle,
                          aga_cper_severity_t severity, aga_cper_info_t *info)
{
    char *cper_data;
    char *cper_record;
    uint64_t buf_size;
    uint64_t num_cper_hdr;
    uint64_t cursor = 0;
    uint32_t severity_mask;
    amdsmi_status_t afid_status;
    amdsmi_status_t status = AMDSMI_STATUS_MORE_DATA;
    amdsmi_cper_hdr_t *cper_hdrs[AGA_GPU_MAX_CPER_ENTRY];

    // set severity mask
    switch (severity) {
    case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED:
        severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED);
        break;
    case AGA_CPER_SEVERITY_FATAL:
        severity_mask = (1 << AMDSMI_CPER_SEV_FATAL);
        break;
    case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED:
        severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED);
        break;
    default:
        severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED) |
                        (1 << AMDSMI_CPER_SEV_FATAL) |
                        (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED);
        break;
    }
    // allocate memory for CPER data
    cper_data = (char *)malloc(CPER_BUF_SIZE);
    if (cper_data == NULL) {
        AGA_TRACE_ERR("Failed to allocate memory for CPER entries of GPU {}",
                      gpu_handle);
        return SDK_RET_OOM;
    }
    // keep fetching batches while there is more data and room to store it
    while ((status == AMDSMI_STATUS_MORE_DATA) &&
           (info->num_cper_entry < AGA_GPU_MAX_CPER_ENTRY)) {
        // both are in/out arguments that come back holding what was actually
        // written, so the capacities must be restored before every call
        buf_size = CPER_BUF_SIZE;
        num_cper_hdr = AGA_GPU_MAX_CPER_ENTRY;
        // get CPER entries
        status = amdsmi_get_gpu_cper_entries(gpu_handle, severity_mask,
                     cper_data, &buf_size, cper_hdrs, &num_cper_hdr, &cursor);
        if ((status != AMDSMI_STATUS_SUCCESS) &&
            (status != AMDSMI_STATUS_MORE_DATA)) {
            AGA_TRACE_ERR("Failed to get CPER entries for GPU {}, err {}",
                          gpu_handle, status);
            // free allocated memory
            free(cper_data);
            return amdsmi_ret_to_sdk_ret(status);
        }
        // every batch is written from the start of cper_data, so the record
        // cursor has to restart there as well; records are assumed to be laid
        // out back to back in the same order as cper_hdrs
        cper_record = cper_data;
        for (uint64_t i = 0; i < num_cper_hdr; i++) {
            if (info->num_cper_entry >= AGA_GPU_MAX_CPER_ENTRY) {
                // no room left in the caller's array
                break;
            }
            auto cper_entry = &info->cper_entry[info->num_cper_entry++];

            cper_entry->record_id = std::string(cper_hdrs[i]->record_id);
            cper_entry->severity =
                smi_to_aga_cper_severity(cper_hdrs[i]->error_severity);
            cper_entry->revision = cper_hdrs[i]->revision;
            if (cper_hdrs[i]->cper_valid_bits.valid_bits.timestamp) {
                cper_entry->timestamp =
                    timestamp_string_from_cper_timestamp(
                        &cper_hdrs[i]->timestamp);
            }
            cper_entry->creator_id = std::string(cper_hdrs[i]->creator_id);
            cper_entry->notification_type =
                smi_to_aga_cper_notification_type(cper_hdrs[i]->notify_type);
            // get AMD field ids from the cper record, num_af_id is set to the
            // capacity of the array going in and holds the count coming out
            cper_entry->num_af_id = AGA_GPU_MAX_AF_ID_PER_CPER;
            afid_status = amdsmi_get_afids_from_cper(cper_record,
                              cper_hdrs[i]->record_length, cper_entry->af_id,
                              &cper_entry->num_af_id);
            if (afid_status != AMDSMI_STATUS_SUCCESS) {
                cper_entry->num_af_id = 0;
                AGA_TRACE_ERR("Failed to get AMD field id for CPER entry for "
                              "GPU {}, err {}", gpu_handle, afid_status);
            }
            // move on to the next record of this batch
            cper_record += cper_hdrs[i]->record_length;
        }
        if (num_cper_hdr == 0) {
            // more data is pending but nothing was returned, bail out
            // instead of spinning on a call that is making no progress
            break;
        }
    }

    // free allocated memory
    free(cper_data);
    return SDK_RET_OK;
}

} // namespace aga
69 changes: 69 additions & 0 deletions sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,75 @@ aga_to_smi_gpu_memory_partition_type (
return AMDSMI_MEMORY_PARTITION_UNKNOWN;
}

/// \brief map an amdsmi CPER severity to the corresponding aga CPER severity
/// \param[in] severity amdsmi CPER severity
/// \return aga CPER severity, AGA_CPER_SEVERITY_NONE if there is no mapping
static inline aga_cper_severity_t
smi_to_aga_cper_severity (amdsmi_cper_sev_t severity)
{
    if (severity == AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED) {
        return AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED;
    }
    if (severity == AMDSMI_CPER_SEV_FATAL) {
        return AGA_CPER_SEVERITY_FATAL;
    }
    if (severity == AMDSMI_CPER_SEV_NON_FATAL_CORRECTED) {
        return AGA_CPER_SEVERITY_NON_FATAL_CORRECTED;
    }
    // anything else has no aga equivalent
    return AGA_CPER_SEVERITY_NONE;
}

/// \brief map the notification type GUID of a CPER record to the aga CPER
///        notification type
/// \param[in] ntfn_type CPER notification type in amdsmi_cper_guid_t format
/// \return aga CPER notification type, AGA_CPER_NOTIFICATION_TYPE_NONE if
///         the GUID is not recognized
static inline aga_cper_notification_type_t
smi_to_aga_cper_notification_type (amdsmi_cper_guid_t ntfn_type)
{
    uint64_t guid_key = 0;
    aga_cper_notification_type_t aga_type = AGA_CPER_NOTIFICATION_TYPE_NONE;

    // only the first 8 bytes of the GUID are used for the lookup, they are
    // folded into a 64 bit value with byte 0 as the least significant byte
    for (uint32_t idx = 0; idx < 8; idx++) {
        guid_key |= ((uint64_t)ntfn_type.b[idx]) << (8 * idx);
    }

    switch (guid_key) {
    case AMDSMI_CPER_NOTIFY_TYPE_CMC:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_CMC;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_CPE:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_CPE;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_MCE:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_MCE;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_PCIE:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_PCIE;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_INIT:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_INIT;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_NMI:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_NMI;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_BOOT:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_BOOT;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_DMAR:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_DMAR;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_SEA:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_SEA;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_SEI:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_SEI;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_PEI:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_PEI;
        break;
    case AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT:
        aga_type = AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT;
        break;
    default:
        // unknown GUID, leave the type as NONE
        break;
    }
    return aga_type;
}

/// \brief convert amdsmi return status to sdk return status
/// \param[in] amdsmi_ret amdsmi return status
/// \return sdk return status
Expand Down
Loading