diff --git a/sw/nic/gpuagent/api/gpu_api.cc b/sw/nic/gpuagent/api/gpu_api.cc index 568662c..f2dda4d 100644 --- a/sw/nic/gpuagent/api/gpu_api.cc +++ b/sw/nic/gpuagent/api/gpu_api.cc @@ -398,3 +398,59 @@ aga_gpu_delete (_In_ aga_obj_key_t *key) { return aga_gpu_api_handle(API_OP_DELETE, key, NULL); } + +typedef struct aga_gpu_cper_read_args_s { + void *ctxt; + aga_cper_severity_t severity; + gpu_cper_read_cb_t cb; +} aga_gpu_cper_read_args_t; + +static bool +aga_gpu_cper_info_from_entry (void *entry, void *ctxt) +{ + sdk_ret_t ret; + aga_cper_info_t info = {}; + gpu_entry *gpu = (gpu_entry *)entry; + aga_gpu_cper_read_args_t *args = (aga_gpu_cper_read_args_t *)ctxt; + + if (gpu->in_use()) { + // some API operation is in progress on this object, skip it + return false; + } + if (gpu->is_parent_gpu()) { + // partition parent GPU objects can be skipped + return false; + } + // set GPU id + info.gpu = gpu->key(); + // get CPER information + ret = aga::smi_gpu_get_cper_entries(gpu->handle(), args->severity, &info); + if (ret != SDK_RET_OK) { + goto done; + } + // call cb on info + args->cb(&info, args->ctxt); +done: + return false; +} + +sdk_ret_t +aga_gpu_cper_read (aga_obj_key_t *key, aga_cper_severity_t severity, + gpu_cper_read_cb_t cb, void *ctxt) +{ + gpu_entry *gpu; + aga_gpu_cper_read_args_t args = { 0 }; + + args.ctxt = ctxt; + args.severity = severity; + args.cb = cb; + if (*key == k_aga_obj_key_invalid) { + return gpu_db()->walk(aga_gpu_cper_info_from_entry, &args); + } else { + gpu = gpu_db()->find(key); + if (gpu) { + return aga_gpu_cper_info_from_entry(gpu, &args); + } + } + return SDK_RET_ENTRY_NOT_FOUND; +} diff --git a/sw/nic/gpuagent/api/include/aga_gpu.hpp b/sw/nic/gpuagent/api/include/aga_gpu.hpp index 72ccd3f..3af1b34 100644 --- a/sw/nic/gpuagent/api/include/aga_gpu.hpp +++ b/sw/nic/gpuagent/api/include/aga_gpu.hpp @@ -47,6 +47,9 @@ limitations under the License. 
#define AGA_GPU_MAX_BAD_PAGE_RECORD 64 #define AGA_GPU_INVALID_PARTITION_ID 0xFFFFFFFF #define AGA_GPU_MAX_PARTITION 8 +#define AGA_GPU_MAX_CPER_ENTRY 128 +#define AGA_GPU_MAX_AF_ID_PER_CPER 12 + /// number of clocks that can not be configured - AGA_GPU_CLOCK_TYPE_FABRIC, /// AGA_GPU_CLOCK_TYPE_SOC (4), AGA_GPU_CLOCK_TYPE_DCE, AGA_GPU_CLOCK_TYPE_PCIE #define AGA_GPU_NUM_NON_CFG_CLOCK_TYPES 7 @@ -793,6 +796,78 @@ typedef struct aga_gpu_memory_partition_info_s { aga_gpu_memory_partition_type_t partition_type; } aga_gpu_memory_partition_info_t; +/// CPER severity +typedef enum aga_cper_severity_e { + /// invalid severity + AGA_CPER_SEVERITY_NONE = 0, + /// non-fatal uncorrected errors + AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED = 1, + /// fatal errors + AGA_CPER_SEVERITY_FATAL = 2, + /// non-fatal corrected errors + AGA_CPER_SEVERITY_NON_FATAL_CORRECTED = 3, +} aga_cper_severity_t; + +/// CPER notification type +typedef enum aga_cper_notification_type_e { + /// invalid notification type + AGA_CPER_NOTIFICATION_TYPE_NONE = 0, + /// Corrected Memory Check (CMC) + AGA_CPER_NOTIFICATION_TYPE_CMC = 1, + /// Corrected Platform Error (CPE) + AGA_CPER_NOTIFICATION_TYPE_CPE = 2, + /// Machine Check Exception (MCE) + AGA_CPER_NOTIFICATION_TYPE_MCE = 3, + /// PCI express error + AGA_CPER_NOTIFICATION_TYPE_PCIE = 4, + /// initialization error + AGA_CPER_NOTIFICATION_TYPE_INIT = 5, + /// Non-Maskable Interrupt (NMI) + AGA_CPER_NOTIFICATION_TYPE_NMI = 6, + /// boot error + AGA_CPER_NOTIFICATION_TYPE_BOOT = 7, + /// Direct Memory Access Remapping (DMAR) error + AGA_CPER_NOTIFICATION_TYPE_DMAR = 8, + /// System Error Architecture (SEA) + AGA_CPER_NOTIFICATION_TYPE_SEA = 9, + /// System Error Interface (SEI) + AGA_CPER_NOTIFICATION_TYPE_SEI = 10, + /// Platform Error Interface (PEI) + AGA_CPER_NOTIFICATION_TYPE_PEI = 11, + /// Compute Express Link component error + AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT = 12, +} aga_cper_notification_type_t; + +/// CPER entry information +typedef 
struct aga_cper_entry_s { + /// CPER entry identifier + std::string record_id; + /// CPER error severity + aga_cper_severity_t severity; + /// CPER format revision + uint32_t revision; + /// CPER error timestamp + std::string timestamp; + /// CPER entry creator identifier + std::string creator_id; + /// CPER entry notification type + aga_cper_notification_type_t notification_type; + /// number of AMD field ids + uint32_t num_af_id; + /// AMD field ids + uint64_t af_id[AGA_GPU_MAX_AF_ID_PER_CPER]; +} aga_cper_entry_t; + +/// CPER information +typedef struct aga_cper_info_s { + /// GPU uuid + aga_obj_key_t gpu; + /// number of cper entries + uint32_t num_cper_entry; + /// cper entries + aga_cper_entry_t cper_entry[AGA_GPU_MAX_CPER_ENTRY]; +} aga_cper_info_t; + /// \brief create gpu /// \param[in] spec config specification /// \return #SDK_RET_OK on success, failure status code on error @@ -896,4 +971,17 @@ sdk_ret_t aga_gpu_update(_In_ aga_gpu_spec_t *spec); /// \return #SDK_RET_OK on success, failure status code on error sdk_ret_t aga_gpu_delete(_In_ aga_obj_key_t *key); +typedef void (*gpu_cper_read_cb_t)(aga_cper_info_t *info, void *ctxt); + +/// \brief read gpu CPER records +/// \param[in] key key of the gpu object, if k_aga_obj_key_invalid we read +/// CPER records of all gpu +/// \param[in] cb callback function +/// \param[in] ctxt opaque context passed to cb +/// \return #SDK_RET_OK on success, failure status code on error +sdk_ret_t aga_gpu_cper_read(_In_ aga_obj_key_t *key, + _In_ aga_cper_severity_t severity, + _In_ gpu_cper_read_cb_t gpu_cper_read_cb, + _In_ void *ctxt); + #endif /// __API_INCLUDE_AGA_GPU_HPP__ diff --git a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc index f18f814..afbbf24 100644 --- a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc +++ b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc @@ -20,6 +20,8 @@ limitations under the License. 
/// //---------------------------------------------------------------------------- +#include <iomanip> +#include <sstream> extern "C" { #include "nic/third-party/rocm/amd_smi_lib/include/amd_smi/amdsmi.h" } @@ -42,6 +44,8 @@ namespace aga { #define AMDSMI_INVALID_UINT64 0xffffffffffffffff #define AMDSMI_DEEP_SLEEP_THRESHOLD 140 #define AMDSMI_COUNTER_RESOLUTION 15.3 +#define CPER_BUF_SIZE (4 * 1024 * 1024) // 4 MB + /// cache GPU metrics so that we don't do repeated calls while filling spec, /// status and statistics @@ -1729,4 +1733,111 @@ smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, return SDK_RET_OK; } +static inline std::string +timestamp_string_from_cper_timestamp (amdsmi_cper_timestamp_t *ts) +{ + uint32_t full_year; + std::ostringstream oss; + + // assuming year is offset from 2000 + full_year = 2000 + ts->year; + + oss << std::setfill('0') << std::setw(4) << full_year << "-" + << std::setw(2) << static_cast<int>(ts->month) << "-" + << std::setw(2) << static_cast<int>(ts->day) << " " + << std::setw(2) << static_cast<int>(ts->hours) << ":" + << std::setw(2) << static_cast<int>(ts->minutes) << ":" + << std::setw(2) << static_cast<int>(ts->seconds); + + return oss.str(); +} + +sdk_ret_t +smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle, + aga_cper_severity_t severity, aga_cper_info_t *info) +{ + char *cper_data; + char *cper_buffer; + uint64_t cursor = 0; + uint32_t severity_mask; + amdsmi_status_t afid_status; + uint64_t total_cper_entries = 0; + uint64_t buf_size = CPER_BUF_SIZE; + uint32_t prev_cper_record_size = 0; + uint64_t num_cper_hdr = AGA_GPU_MAX_CPER_ENTRY; + amdsmi_status_t status = AMDSMI_STATUS_MORE_DATA; + amdsmi_cper_hdr_t *cper_hdrs[AGA_GPU_MAX_CPER_ENTRY]; + + // set severity mask + switch (severity) { + case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED); + break; + case AGA_CPER_SEVERITY_FATAL: + severity_mask = (1 << AMDSMI_CPER_SEV_FATAL); + break; + case
AGA_CPER_SEVERITY_NON_FATAL_CORRECTED: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + default: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED) | + (1 << AMDSMI_CPER_SEV_FATAL) | + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + } + // allocate memory for CPER data + cper_data = (char *)malloc(buf_size); + // cper_buffer is used to keep track of each individual record + cper_buffer = cper_data; + while (status == AMDSMI_STATUS_MORE_DATA) { + // get CPER entries + status = amdsmi_get_gpu_cper_entries(gpu_handle, severity_mask, + cper_data, &buf_size, cper_hdrs, &num_cper_hdr, &cursor); + if ((status != AMDSMI_STATUS_SUCCESS) && + (status != AMDSMI_STATUS_MORE_DATA)) { + AGA_TRACE_ERR("Failed to get CPER entries for GPU {}, err {}", + gpu_handle, status); + // free allocated memory + free(cper_data); + return amdsmi_ret_to_sdk_ret(status); + } + for (uint64_t i = 0; + i < num_cper_hdr && total_cper_entries < AGA_GPU_MAX_CPER_ENTRY; + i++, total_cper_entries++) { + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + cper_entry->record_id = std::string(cper_hdrs[i]->record_id); + cper_entry->severity = + smi_to_aga_cper_severity(cper_hdrs[i]->error_severity); + cper_entry->revision = cper_hdrs[i]->revision; + if (cper_hdrs[i]->cper_valid_bits.valid_bits.timestamp) { + cper_entry->timestamp = + timestamp_string_from_cper_timestamp( + &cper_hdrs[i]->timestamp); + } + cper_entry->creator_id = std::string(cper_hdrs[i]->creator_id); + cper_entry->notification_type = + smi_to_aga_cper_notification_type(cper_hdrs[i]->notify_type); + // get AMD field ids from the cper record + cper_buffer += prev_cper_record_size; + // initialize num_af_id to be the size of the array + cper_entry->num_af_id = AGA_GPU_MAX_AF_ID_PER_CPER; + afid_status = amdsmi_get_afids_from_cper(cper_buffer, + cper_hdrs[i]->record_length, cper_entry->af_id, + &cper_entry->num_af_id); + if (afid_status != AMDSMI_STATUS_SUCCESS) { + 
cper_entry->num_af_id = 0; + AGA_TRACE_ERR("Failed to get AMD field id for CPER entry for " + "GPU {}, err {}", gpu_handle, afid_status); + } + // update prev_cper_record_size + prev_cper_record_size = cper_hdrs[i]->record_length; + } + } + + // free allocated memory + free(cper_data); + return SDK_RET_OK; +} + } // namespace aga diff --git a/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp b/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp index 2223521..7bc6d53 100644 --- a/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp +++ b/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp @@ -444,6 +444,75 @@ aga_to_smi_gpu_memory_partition_type ( return AMDSMI_MEMORY_PARTITION_UNKNOWN; } +/// \brief convert amdsmi CPER severity to aga CPER severity +/// \param[in] amdsmi CPER severity +/// \return aga CPER severity +static inline aga_cper_severity_t +smi_to_aga_cper_severity (amdsmi_cper_sev_t severity) +{ + switch (severity) { + case AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED: + return AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED; + case AMDSMI_CPER_SEV_FATAL: + return AGA_CPER_SEVERITY_FATAL; + case AMDSMI_CPER_SEV_NON_FATAL_CORRECTED: + return AGA_CPER_SEVERITY_NON_FATAL_CORRECTED; + default: + break; + } + + return AGA_CPER_SEVERITY_NONE; +} + +/// \brief convert amdsmi CPER notification type to aga CPER notification type +/// \param[in] amdsmi CPER notification type in amdsmi_cper_guid_t format +/// \return aga CPER notification type +static inline aga_cper_notification_type_t +smi_to_aga_cper_notification_type (amdsmi_cper_guid_t ntfn_type) +{ + uint64_t amdsmi_ntfn_type; + + amdsmi_ntfn_type = (uint64_t)ntfn_type.b[0] | + ((uint64_t)ntfn_type.b[1] << 8) | + ((uint64_t)ntfn_type.b[2] << 16) | + ((uint64_t)ntfn_type.b[3] << 24) | + ((uint64_t)ntfn_type.b[4] << 32) | + ((uint64_t)ntfn_type.b[5] << 40) | + ((uint64_t)ntfn_type.b[6] << 48) | + ((uint64_t)ntfn_type.b[7] << 56); + + switch (amdsmi_ntfn_type) { + case AMDSMI_CPER_NOTIFY_TYPE_CMC: + return AGA_CPER_NOTIFICATION_TYPE_CMC; + case
AMDSMI_CPER_NOTIFY_TYPE_CPE: + return AGA_CPER_NOTIFICATION_TYPE_CPE; + case AMDSMI_CPER_NOTIFY_TYPE_MCE: + return AGA_CPER_NOTIFICATION_TYPE_MCE; + case AMDSMI_CPER_NOTIFY_TYPE_PCIE: + return AGA_CPER_NOTIFICATION_TYPE_PCIE; + case AMDSMI_CPER_NOTIFY_TYPE_INIT: + return AGA_CPER_NOTIFICATION_TYPE_INIT; + case AMDSMI_CPER_NOTIFY_TYPE_NMI: + return AGA_CPER_NOTIFICATION_TYPE_NMI; + case AMDSMI_CPER_NOTIFY_TYPE_BOOT: + return AGA_CPER_NOTIFICATION_TYPE_BOOT; + case AMDSMI_CPER_NOTIFY_TYPE_DMAR: + return AGA_CPER_NOTIFICATION_TYPE_DMAR; + case AMDSMI_CPER_NOTIFY_TYPE_SEA: + return AGA_CPER_NOTIFICATION_TYPE_SEA; + case AMDSMI_CPER_NOTIFY_TYPE_SEI: + return AGA_CPER_NOTIFICATION_TYPE_SEI; + case AMDSMI_CPER_NOTIFY_TYPE_PEI: + return AGA_CPER_NOTIFICATION_TYPE_PEI; + case AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT: + return AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT; + default: + break; + } + + return AGA_CPER_NOTIFICATION_TYPE_NONE; +} + /// \brief convert amdsmi return status to sdk return status /// \param[in] amdsmi_ret amdsmi return status /// \return sdk return status diff --git a/sw/nic/gpuagent/api/smi/gimamdsmi/smi_api.cc b/sw/nic/gpuagent/api/smi/gimamdsmi/smi_api.cc index 193c830..76984ce 100644 --- a/sw/nic/gpuagent/api/smi/gimamdsmi/smi_api.cc +++ b/sw/nic/gpuagent/api/smi/gimamdsmi/smi_api.cc @@ -21,6 +21,8 @@ /// //---------------------------------------------------------------------------- +#include <iomanip> +#include <sstream> extern "C" { #include "nic/third-party/rocm/gim_amd_smi_lib/include/amd_smi/amdsmi.h" } @@ -40,6 +42,7 @@ namespace aga { #define AMDSMI_DEEP_SLEEP_THRESHOLD 140 #define AMDSMI_UINT32_INVALID_VAL 0xffffffff #define AMDSMI_UINT64_INVALID_VAL 0xffffffffffffffff +#define CPER_BUF_SIZE (4 * 1024 * 1024) // 4 MB /// \brief struct to be used as ctxt when walking GPU db to build topology typedef struct gpu_topo_walk_ctxt_s { @@ -1137,4 +1140,111 @@ smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, return SDK_RET_OK; } +static inline
std::string +timestamp_string_from_cper_timestamp (amdsmi_cper_timestamp_t *ts) +{ + uint32_t full_year; + std::ostringstream oss; + + // assuming year is offset from 2000 + full_year = 2000 + ts->year; + + oss << std::setfill('0') << std::setw(4) << full_year << "-" + << std::setw(2) << static_cast<int>(ts->month) << "-" + << std::setw(2) << static_cast<int>(ts->day) << " " + << std::setw(2) << static_cast<int>(ts->hours) << ":" + << std::setw(2) << static_cast<int>(ts->minutes) << ":" + << std::setw(2) << static_cast<int>(ts->seconds); + + return oss.str(); +} + +sdk_ret_t +smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle, + aga_cper_severity_t severity, aga_cper_info_t *info) +{ + char *cper_data; + char *cper_buffer; + uint64_t cursor = 0; + uint32_t severity_mask; + amdsmi_status_t afid_status; + uint64_t total_cper_entries = 0; + uint64_t buf_size = CPER_BUF_SIZE; + uint32_t prev_cper_record_size = 0; + uint64_t num_cper_hdr = AGA_GPU_MAX_CPER_ENTRY; + amdsmi_status_t status = AMDSMI_STATUS_MORE_DATA; + amdsmi_cper_hdr_t *cper_hdrs[AGA_GPU_MAX_CPER_ENTRY]; + + // set severity mask + switch (severity) { + case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED); + break; + case AGA_CPER_SEVERITY_FATAL: + severity_mask = (1 << AMDSMI_CPER_SEV_FATAL); + break; + case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + default: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED) | + (1 << AMDSMI_CPER_SEV_FATAL) | + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + } + // allocate memory for CPER data + cper_data = (char *)malloc(buf_size); + // cper_buffer is used to keep track of each individual record + cper_buffer = cper_data; + while (status == AMDSMI_STATUS_MORE_DATA) { + // get CPER entries + status = amdsmi_get_gpu_cper_entries(gpu_handle, severity_mask, + cper_data, &buf_size, cper_hdrs, &num_cper_hdr, &cursor); + if ((status !=
AMDSMI_STATUS_SUCCESS) && + (status != AMDSMI_STATUS_MORE_DATA)) { + AGA_TRACE_ERR("Failed to get CPER entries for GPU {}, err {}", + gpu_handle, status); + // free allocated memory + free(cper_data); + return amdsmi_ret_to_sdk_ret(status); + } + for (uint64_t i = 0; + i < num_cper_hdr && total_cper_entries < AGA_GPU_MAX_CPER_ENTRY; + i++, total_cper_entries++) { + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + cper_entry->record_id = std::string(cper_hdrs[i]->record_id); + cper_entry->severity = + smi_to_aga_cper_severity(cper_hdrs[i]->error_severity); + cper_entry->revision = cper_hdrs[i]->revision; + if (cper_hdrs[i]->cper_valid_bits.valid_bits.timestamp) { + cper_entry->timestamp = + timestamp_string_from_cper_timestamp( + &cper_hdrs[i]->timestamp); + } + cper_entry->creator_id = std::string(cper_hdrs[i]->creator_id); + cper_entry->notification_type = + smi_to_aga_cper_notification_type(cper_hdrs[i]->notify_type); + // get AMD field ids from the cper record + cper_buffer += prev_cper_record_size; + // initialize num_af_id to be the size of the array + cper_entry->num_af_id = AGA_GPU_MAX_AF_ID_PER_CPER; + afid_status = amdsmi_get_afids_from_cper(cper_buffer, + cper_hdrs[i]->record_length, cper_entry->af_id, + &cper_entry->num_af_id); + if (afid_status != AMDSMI_STATUS_SUCCESS) { + cper_entry->num_af_id = 0; + AGA_TRACE_ERR("Failed to get AMD field id for CPER entry for " + "GPU {}, err {}", gpu_handle, afid_status); + } + // update prev_cper_record_size + prev_cper_record_size = cper_hdrs[i]->record_length; + } + } + + // free allocated memory + free(cper_data); + return SDK_RET_OK; +} + } // namespace aga diff --git a/sw/nic/gpuagent/api/smi/gimamdsmi/smi_utils.hpp b/sw/nic/gpuagent/api/smi/gimamdsmi/smi_utils.hpp index 57ea87f..22ad81c 100644 --- a/sw/nic/gpuagent/api/smi/gimamdsmi/smi_utils.hpp +++ b/sw/nic/gpuagent/api/smi/gimamdsmi/smi_utils.hpp @@ -367,6 +367,75 @@ aga_to_smi_gpu_memory_partition_type ( return AMDSMI_MEMORY_PARTITION_UNKNOWN;
} +/// \brief convert amdsmi CPER severity to aga CPER severity +/// \param[in] amdsmi CPER severity +/// \return aga CPER severity +static inline aga_cper_severity_t +smi_to_aga_cper_severity (amdsmi_cper_sev_t severity) +{ + switch (severity) { + case AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED: + return AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED; + case AMDSMI_CPER_SEV_FATAL: + return AGA_CPER_SEVERITY_FATAL; + case AMDSMI_CPER_SEV_NON_FATAL_CORRECTED: + return AGA_CPER_SEVERITY_NON_FATAL_CORRECTED; + default: + break; + } + + return AGA_CPER_SEVERITY_NONE; +} + +/// \brief convert amdsmi CPER notification type to aga CPER notification type +/// \param[in] amdsmi CPER notification type in amdsmi_cper_guid_t format +/// \return aga CPER notification type +static inline aga_cper_notification_type_t +smi_to_aga_cper_notification_type (amdsmi_cper_guid_t ntfn_type) +{ + uint64_t amdsmi_ntfn_type; + + amdsmi_ntfn_type = (uint64_t)ntfn_type.b[0] | + ((uint64_t)ntfn_type.b[1] << 8) | + ((uint64_t)ntfn_type.b[2] << 16) | + ((uint64_t)ntfn_type.b[3] << 24) | + ((uint64_t)ntfn_type.b[4] << 32) | + ((uint64_t)ntfn_type.b[5] << 40) | + ((uint64_t)ntfn_type.b[6] << 48) | + ((uint64_t)ntfn_type.b[7] << 56); + + switch (amdsmi_ntfn_type) { + case AMDSMI_CPER_NOTIFY_TYPE_CMC: + return AGA_CPER_NOTIFICATION_TYPE_CMC; + case AMDSMI_CPER_NOTIFY_TYPE_CPE: + return AGA_CPER_NOTIFICATION_TYPE_CPE; + case AMDSMI_CPER_NOTIFY_TYPE_MCE: + return AGA_CPER_NOTIFICATION_TYPE_MCE; + case AMDSMI_CPER_NOTIFY_TYPE_PCIE: + return AGA_CPER_NOTIFICATION_TYPE_PCIE; + case AMDSMI_CPER_NOTIFY_TYPE_INIT: + return AGA_CPER_NOTIFICATION_TYPE_INIT; + case AMDSMI_CPER_NOTIFY_TYPE_NMI: + return AGA_CPER_NOTIFICATION_TYPE_NMI; + case AMDSMI_CPER_NOTIFY_TYPE_BOOT: + return AGA_CPER_NOTIFICATION_TYPE_BOOT; + case AMDSMI_CPER_NOTIFY_TYPE_DMAR: + return AGA_CPER_NOTIFICATION_TYPE_DMAR; + case AMDSMI_CPER_NOTIFY_TYPE_SEA: + return AGA_CPER_NOTIFICATION_TYPE_SEA; + case AMDSMI_CPER_NOTIFY_TYPE_SEI: + return 
AGA_CPER_NOTIFICATION_TYPE_SEI; + case AMDSMI_CPER_NOTIFY_TYPE_PEI: + return AGA_CPER_NOTIFICATION_TYPE_PEI; + case AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT: + return AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT; + default: + break; + } + + return AGA_CPER_NOTIFICATION_TYPE_NONE; +} + /// \brief convert amdsmi return status to sdk return status /// \param[in] amdsmi_ret amdsmi return status /// \return sdk return status diff --git a/sw/nic/gpuagent/api/smi/smi_api.hpp b/sw/nic/gpuagent/api/smi/smi_api.hpp index fe69bf8..9097bef 100644 --- a/sw/nic/gpuagent/api/smi/smi_api.hpp +++ b/sw/nic/gpuagent/api/smi/smi_api.hpp @@ -163,6 +163,16 @@ sdk_ret_t smi_gpu_init_immutable_attrs(aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, aga_gpu_status_t *status); +/// \brief function to get GPU CPER entries +/// \param[in] gpu_handle handle of GPU device +/// \param[in] severity severity of CPER entries to be retrieved +/// AGA_CPER_SEVERITY_NONE implies all +/// \param[out] info GPU CPER information +/// \return SDK_RET_OK or error code in case of failure +sdk_ret_t smi_gpu_get_cper_entries(aga_gpu_handle_t gpu_handle, + aga_cper_severity_t severity, + aga_cper_info_t *info); + /// \@} } // namespace aga diff --git a/sw/nic/gpuagent/api/smi/smi_api_mock.cc b/sw/nic/gpuagent/api/smi/smi_api_mock.cc index 4379b3a..114318f 100644 --- a/sw/nic/gpuagent/api/smi/smi_api_mock.cc +++ b/sw/nic/gpuagent/api/smi/smi_api_mock.cc @@ -675,4 +675,23 @@ smi_gpu_get_bad_page_records (void *gpu_obj, return SDK_RET_OK; } +sdk_ret_t +smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle, + aga_cper_severity_t severity, aga_cper_info_t *info) +{ + info->num_cper_entry = 0; + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + cper_entry->record_id = "7:1"; + cper_entry->severity = AGA_CPER_SEVERITY_FATAL; + cper_entry->revision = 256; + + cper_entry->timestamp = "2025-09-12 15:00:27"; + cper_entry->notification_type = AGA_CPER_NOTIFICATION_TYPE_MCE; + cper_entry->creator_id = "amdgpu"; +
cper_entry->num_af_id = 1; + cper_entry->af_id[0] = 30; + return SDK_RET_OK; +} + } // namespace aga diff --git a/sw/nic/gpuagent/cli/cmd/gpu.go b/sw/nic/gpuagent/cli/cmd/gpu.go index dca9e31..a3f65e0 100644 --- a/sw/nic/gpuagent/cli/cmd/gpu.go +++ b/sw/nic/gpuagent/cli/cmd/gpu.go @@ -61,6 +61,7 @@ var ( memClkFreqLo uint32 memClkFreqHi uint32 printHdr bool + severity string ) const ( @@ -101,6 +102,13 @@ var gpuBadPageShowCmd = &cobra.Command{ RunE: gpuBadPageShowCmdHandler, } +var gpuCPERShowCmd = &cobra.Command{ + Use: "cper-records", + Short: "show GPU CPER records", + Long: "show GPU CPER information", + RunE: gpuCPERShowCmdHandler, +} + var gpuStatsShowCmd = &cobra.Command{ Use: "statistics", Short: "show GPU statistics", @@ -149,6 +157,14 @@ func init() { gpuBadPageShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuShowCmd.AddCommand(gpuCPERShowCmd) + gpuCPERShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", + "Specify GPU id") + gpuCPERShowCmd.Flags().StringVarP(&severity, "severity", "s", "all", + "Specify CPER severity (\"fatal\", \"non-fatal-uncorrected\", "+ + "\"non-fatal-corrected\" or \"all\")") + gpuCPERShowCmd.Flags().BoolP("json", "j", false, "Output in json") + DebugUpdateCmd.AddCommand(gpuUpdateCmd) gpuUpdateCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") gpuUpdateCmd.Flags().StringVarP(&gpuAdminState, "admin-state", "a", "", @@ -226,8 +242,8 @@ func NewGPUComputePartition(resp *aga.GPUComputePartition) *ShadowGPUComputePart func printGPUPartitionsJson(resp *aga.GPUComputePartition) { partition := NewGPUComputePartition(resp) - b, _ := json.MarshalIndent(partition, " ", " ") - fmt.Println(string(b)) + b, _ := json.MarshalIndent(partition, " ", " ") + fmt.Printf(" %s", string(b)) } func gpuPartitionsShowCmdHandler(cmd *cobra.Command, args []string) error { @@ -310,6 +326,137 @@ func gpuPartitionsShowCmdHandler(cmd *cobra.Command, args []string) error { return nil } +type ShadowGPUCPEREntry struct { + GPU 
string + CPEREntry []*aga.CPEREntry +} + +func NewCPER(cper *aga.GPUCPEREntry) *ShadowGPUCPEREntry { + return &ShadowGPUCPEREntry{ + GPU: utils.IdToStr(cper.GetGPU()), + CPEREntry: cper.GetCPEREntry(), + } +} + +func printGPUCPEREntryJson(cper *aga.GPUCPEREntry) { + b, _ := json.MarshalIndent(NewCPER(cper), " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuCPERShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUCPERGetResponse{} + var req *aga.GPUCPERGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUCPERGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUCPERGetRequest{ + Id: [][]byte{}, + } + } + switch strings.ToLower(severity) { + case "all": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NONE + case "non-fatal-uncorrected": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NON_FATAL_UNCORRECTED + case "fatal": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_FATAL + case "non-fatal-corrected": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NON_FATAL_CORRECTED + default: + return fmt.Errorf("Invalid value specified for \"--severity\"") + } + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUCPERGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU CPER failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print CPER information + if cmd != nil && 
cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, cper := range respMsg.CPER { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUCPEREntryJson(cper) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else { + hdrLine := strings.Repeat("-", 156) + fmt.Println(hdrLine) + fmt.Printf("%-20s%-40s%-16s%-25s%-10s%-10s%-15s%-20s\n", + "Timestamp", "GPU", "RecordId", "Severity", "Revision", "CreatorId", + "NtfnType", "AMDFieldId") + fmt.Println(hdrLine) + for _, cper := range respMsg.CPER { + gpuStr := utils.IdToStr(cper.GetGPU()) + for _, entry := range cper.GetCPEREntry() { + severityStr := strings.Replace(entry.GetSeverity().String(), + "CPER_SEVERITY_", "", -1) + ntfnTypeStr := + strings.Replace(entry.GetNotificationType().String(), + "CPER_NOTIFICATION_TYPE_", "", -1) + ntfnTypeStr = strings.Replace(ntfnTypeStr, "_", "-", -1) + + var afIdBuilder strings.Builder + indent := strings.Repeat(" ", 121) + for i, afId := range entry.GetAFId() { + afIdBuilder.WriteString(strconv.FormatUint(afId, 10)) + if (i+1)%3 == 0 { + if i != len(entry.GetAFId())-1 { + afIdBuilder.WriteString("\n" + indent) + } + } else if i != len(entry.GetAFId())-1 { + afIdBuilder.WriteString(", ") + } + } + afIdStr := afIdBuilder.String() + fmt.Printf("%-20s%-40s%-16s%-25s%-10d%-10s%-15s%-20s\n", + entry.GetTimestamp(), gpuStr, entry.GetRecordId(), + severityStr, entry.GetRevision(), entry.GetCreatorId(), + ntfnTypeStr, afIdStr) + } + } + } + return nil +} func gpuShowCmdHandler(cmd *cobra.Command, args []string) error { if len(args) > 0 { return fmt.Errorf("Invalid argument") @@ -693,8 +840,8 @@ func printGPUStatus(gpu *aga.GPU, statusOnly bool) { fmt.Printf(indent+"%-38s : %d\n", "DRM render id", 
status.GetDRMRenderId()) fmt.Printf(indent+"%-38s : %d\n", "DRM card id", status.GetDRMCardId()) fmt.Printf(indent+"%-38s : %s\n", "Virtualization mode", - strings.ToLower(strings.Replace(status.GetVirtualizationMode().String(), - "GPU_VIRTUALIZATION_MODE_", "", -1))) + strings.ToLower(strings.Replace(status.GetVirtualizationMode().String(), + "GPU_VIRTUALIZATION_MODE_", "", -1))) fmt.Printf(indent+"%-38s : 0x%x\n", "GPU handle", status.GetGPUHandle()) if status.GetSerialNum() != "" { fmt.Printf(indent+"%-38s : %s\n", "Serial number", @@ -728,7 +875,7 @@ func printGPUStatus(gpu *aga.GPU, statusOnly bool) { case aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_NONE: break default: - fmt.Printf(indent+"%-38s : %d\n", "Partition ID", + fmt.Printf(indent+"%-38s : %d\n", "Partition Id", status.GetPartitionId()) } fwVer := status.GetFirmwareVersion() @@ -1555,8 +1702,8 @@ func NewGPU(resp *aga.GPU) *ShadowGPU { func printGPUJson(resp *aga.GPU) { gpu := NewGPU(resp) - b, _ := json.MarshalIndent(gpu, " ", " ") - fmt.Println(string(b)) + b, _ := json.MarshalIndent(gpu, " ", " ") + fmt.Printf(" %s", string(b)) } func gpuUpdateCmdPreRunE(cmd *cobra.Command, args []string) error { diff --git a/sw/nic/gpuagent/protos/gpu.proto b/sw/nic/gpuagent/protos/gpu.proto index 6b87b9e..dd8a2e2 100644 --- a/sw/nic/gpuagent/protos/gpu.proto +++ b/sw/nic/gpuagent/protos/gpu.proto @@ -41,6 +41,8 @@ service GPUSvc { rpc GPUMemoryPartitionSet(GPUMemoryPartitionSetRequest) returns (GPUMemoryPartitionSetResponse) {} // GPU memory partition get API rpc GPUMemoryPartitionGet(GPUMemoryPartitionGetRequest) returns (GPUMemoryPartitionGetResponse) {} + // GPU CPER get API + rpc GPUCPERGet (GPUCPERGetRequest) returns (GPUCPERGetResponse) {} // operational APIs or tasks // GPU reset API @@ -715,10 +717,10 @@ message GPUGetRequest { message GPUGetResponse { // result of the API processing types.ApiStatus ApiStatus = 1; - // list of per GPU information - repeated GPU Response = 2; // specific error 
code, if any - types.ErrorCode ErrorCode = 3; + types.ErrorCode ErrorCode = 2; + // list of per GPU information + repeated GPU Response = 3; } // GPU update request message @@ -855,3 +857,93 @@ message GPUMemoryPartitionGetResponse { // list of per GPU information repeated GPUMemoryPartition Response = 3; } + +// CPER error severity +enum CPERSeverity { + // invalid severity + CPER_SEVERITY_NONE = 0; + // non-fatal uncorrected errors + CPER_SEVERITY_NON_FATAL_UNCORRECTED = 1; + // fatal errors + CPER_SEVERITY_FATAL = 2; + // non-fatal corrected errors + CPER_SEVERITY_NON_FATAL_CORRECTED = 3; +} + +// GPU CPER get request +message GPUCPERGetRequest { + // list of GPU uuids + repeated bytes Id = 1; + // severity of CPER entries to be retrieved + // CPER_SEVERITY_NONE implies all CPER records + CPERSeverity Severity = 2; +} + +// CPER notify type +enum CPERNotificationType { + // invalid notification type + CPER_NOTIFICATION_TYPE_NONE = 0; + // Corrected Memory Check (CMC) + CPER_NOTIFICATION_TYPE_CMC = 1; + // Corrected Platform Error (CPE) + CPER_NOTIFICATION_TYPE_CPE = 2; + // Machine Check Exception (MCE) + CPER_NOTIFICATION_TYPE_MCE = 3; + // PCI express error + CPER_NOTIFICATION_TYPE_PCIE = 4; + // initialization error + CPER_NOTIFICATION_TYPE_INIT = 5; + // Non-Maskable Interrupt (NMI) + CPER_NOTIFICATION_TYPE_NMI = 6; + // boot error + CPER_NOTIFICATION_TYPE_BOOT = 7; + // Direct Memory Access Remapping (DMAR) error + CPER_NOTIFICATION_TYPE_DMAR = 8; + // System Error Architecture (SEA) + CPER_NOTIFICATION_TYPE_SEA = 9; + // System Error Interface (SEI) + CPER_NOTIFICATION_TYPE_SEI = 10; + // Platform Error Interface (PEI) + CPER_NOTIFICATION_TYPE_PEI = 11; + // Compute Express Link (CXL) component error + CPER_NOTIFICATION_TYPE_CXL_COMPONENT = 12; +} + +// CPER entry information +message CPEREntry { + // CPER entry identifier + string RecordId = 1; + // CPER error severity + CPERSeverity Severity = 2; + // CPER format revision + uint32 Revision = 3; + // CPER 
error timestamp + string Timestamp = 4; + // CPER entry creator + string CreatorId = 5; + // CPER notify type + CPERNotificationType NotificationType = 6; + // AMD field ids in CPER entry + // NOTE: + // https://docs.amd.com/r/en-US/AMD_Field_ID_70122_v1.0/AFID-Event-List has + // more information about AMD field ids and their usage + repeated uint64 AFId = 7; +} + +// GPU CPER entry +message GPUCPEREntry { + // GPU id + bytes GPU = 1; + // CPER entries + repeated CPEREntry CPEREntry = 2; +} + +// GPU CPER get response +message GPUCPERGetResponse { + // result of the API processing + types.ApiStatus ApiStatus = 1; + // specific error code, if any + types.ErrorCode ErrorCode = 2; + // GPU CPER entries + repeated GPUCPEREntry CPER = 3; +} diff --git a/sw/nic/gpuagent/svc/gpu.cc b/sw/nic/gpuagent/svc/gpu.cc index e68400a..acf1769 100644 --- a/sw/nic/gpuagent/svc/gpu.cc +++ b/sw/nic/gpuagent/svc/gpu.cc @@ -119,3 +119,16 @@ GPUSvcImpl::GPUMemoryPartitionGet(ServerContext *context, proto_rsp->set_errorcode(sdk_ret_to_error_code(ret)); return Status::OK; } + +Status +GPUSvcImpl::GPUCPERGet(ServerContext *context, + const GPUCPERGetRequest *proto_req, + GPUCPERGetResponse *proto_rsp) { + sdk_ret_t ret; + + ret = aga_svc_gpu_cper_get(proto_req, proto_rsp); + proto_rsp->set_apistatus(sdk_ret_to_api_status(ret)); + proto_rsp->set_errorcode(sdk_ret_to_error_code(ret)); + return Status::OK; +} + diff --git a/sw/nic/gpuagent/svc/gpu.hpp b/sw/nic/gpuagent/svc/gpu.hpp index 0c9a192..01b916d 100644 --- a/sw/nic/gpuagent/svc/gpu.hpp +++ b/sw/nic/gpuagent/svc/gpu.hpp @@ -64,6 +64,9 @@ using amdgpu::GPUMemoryPartitionSetRequest; using amdgpu::GPUMemoryPartitionSetResponse; using amdgpu::GPUMemoryPartitionGetRequest; using amdgpu::GPUMemoryPartitionGetResponse; +using amdgpu::GPUCPEREntry; +using amdgpu::GPUCPERGetRequest; +using amdgpu::GPUCPERGetResponse; class GPUSvcImpl final : public GPUSvc::Service { public: @@ -88,6 +91,9 @@ class GPUSvcImpl final : public GPUSvc::Service { 
Status GPUMemoryPartitionGet(ServerContext *context,
                                  const GPUMemoryPartitionGetRequest *proto_req,
                                  GPUMemoryPartitionGetResponse *proto_rsp) override;
+    Status GPUCPERGet(ServerContext *context,
+                      const GPUCPERGetRequest *proto_req,
+                      GPUCPERGetResponse *proto_rsp) override;
 };

 class DebugGPUSvcImpl final : public DebugGPUSvc::Service {
diff --git a/sw/nic/gpuagent/svc/gpu_svc.hpp b/sw/nic/gpuagent/svc/gpu_svc.hpp
index 9fb17a4..e577989 100644
--- a/sw/nic/gpuagent/svc/gpu_svc.hpp
+++ b/sw/nic/gpuagent/svc/gpu_svc.hpp
@@ -329,4 +329,36 @@ aga_svc_gpu_memory_partition_get (
 }

+static inline sdk_ret_t
+aga_svc_gpu_cper_get (const GPUCPERGetRequest *proto_req,
+                      GPUCPERGetResponse *proto_rsp)
+{
+    sdk_ret_t ret;
+    aga_obj_key_t key = k_aga_obj_key_invalid;
+    aga_cper_severity_t severity = AGA_CPER_SEVERITY_NONE;
+
+    if (proto_req == NULL) {
+        proto_rsp->set_apistatus(types::ApiStatus::API_STATUS_INVALID_ARG);
+        return SDK_RET_INVALID_ARG;
+    }
+    aga_api_trace_verbose("GPU", "Get", proto_req);
+    severity = aga_cper_severity_to_spec(proto_req->severity());
+    if (proto_req->id_size() == 0) {    // no ids given, read with invalid key
+        ret = aga_gpu_cper_read(&key, severity, aga_gpu_cper_api_info_to_proto,
+                                proto_rsp);
+        proto_rsp->set_apistatus(sdk_ret_to_api_status(ret));
+    }
+    for (int i = 0; i < proto_req->id_size(); i ++) {    // one read per id
+        aga_obj_key_proto_to_api_spec(&key, proto_req->id(i));
+        ret = aga_gpu_cper_read(&key, severity, aga_gpu_cper_api_info_to_proto,
+                                proto_rsp);
+        if (unlikely(ret != SDK_RET_OK)) {
+            proto_rsp->set_apistatus(sdk_ret_to_api_status(ret));
+            break;    // stop at the first id that fails
+        }
+        proto_rsp->set_apistatus(types::ApiStatus::API_STATUS_OK);
+    }
+    return ret;    // status of the last read attempted
+}
+
 #endif // __AGA_SVC_GPU_SVC_HPP__
diff --git a/sw/nic/gpuagent/svc/gpu_to_proto.hpp b/sw/nic/gpuagent/svc/gpu_to_proto.hpp
index a8911ed..479887d 100644
--- a/sw/nic/gpuagent/svc/gpu_to_proto.hpp
+++ b/sw/nic/gpuagent/svc/gpu_to_proto.hpp
@@ -724,4 +724,101 @@ aga_gpu_api_info_to_proto (aga_gpu_info_t *info, void *ctxt)
     aga_gpu_api_stats_to_proto(proto_stats,
&info->stats);
 }

+// convert aga cper severity to proto; unmapped values become NONE
+static inline amdgpu::CPERSeverity
+aga_cper_severity_to_proto (aga_cper_severity_t severity)
+{
+    switch (severity) {
+    case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED:
+        return amdgpu::CPER_SEVERITY_NON_FATAL_UNCORRECTED;
+        break;
+    case AGA_CPER_SEVERITY_FATAL:
+        return amdgpu::CPER_SEVERITY_FATAL;
+        break;
+    case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED:
+        return amdgpu::CPER_SEVERITY_NON_FATAL_CORRECTED;
+        break;
+    default:    // AGA_CPER_SEVERITY_NONE or any unmapped value
+        break;
+    }
+    return amdgpu::CPER_SEVERITY_NONE;
+}
+
+// convert aga cper notification type to proto; unmapped values become NONE
+static inline amdgpu::CPERNotificationType
+aga_cper_notification_type_to_proto (aga_cper_notification_type_t ntfn_type)
+{
+    switch (ntfn_type) {
+    case AGA_CPER_NOTIFICATION_TYPE_CMC:
+        return amdgpu::CPER_NOTIFICATION_TYPE_CMC;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_CPE:
+        return amdgpu::CPER_NOTIFICATION_TYPE_CPE;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_MCE:
+        return amdgpu::CPER_NOTIFICATION_TYPE_MCE;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_PCIE:
+        return amdgpu::CPER_NOTIFICATION_TYPE_PCIE;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_INIT:
+        return amdgpu::CPER_NOTIFICATION_TYPE_INIT;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_NMI:
+        return amdgpu::CPER_NOTIFICATION_TYPE_NMI;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_BOOT:
+        return amdgpu::CPER_NOTIFICATION_TYPE_BOOT;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_DMAR:
+        return amdgpu::CPER_NOTIFICATION_TYPE_DMAR;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_SEA:
+        return amdgpu::CPER_NOTIFICATION_TYPE_SEA;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_SEI:
+        return amdgpu::CPER_NOTIFICATION_TYPE_SEI;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_PEI:
+        return amdgpu::CPER_NOTIFICATION_TYPE_PEI;
+        break;
+    case AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT:
+        return amdgpu::CPER_NOTIFICATION_TYPE_CXL_COMPONENT;
+        break;
+    default:    // AGA_CPER_NOTIFICATION_TYPE_NONE or any unmapped value
+        break;
+    }
+    return amdgpu::CPER_NOTIFICATION_TYPE_NONE;
+}
+
+// populate gpu cper information proto buf
+static inline void
+aga_gpu_cper_api_info_to_proto (aga_cper_info_t *info,
+                                void *ctxt)
+{
+    GPUCPEREntry *cper;
+    GPUCPERGetResponse *proto_rsp = (GPUCPERGetResponse *)ctxt;
+
+    if (!info->num_cper_entry) {    // nothing to report for this GPU
+        return;
+    }
+    cper = proto_rsp->add_cper();    // one GPUCPEREntry per GPU
+    cper->set_gpu(info->gpu.id, OBJ_MAX_KEY_LEN);
+    for (uint32_t i = 0; i < info->num_cper_entry; i++) {
+        auto cper_entry = cper->add_cperentry();
+        cper_entry->set_recordid(info->cper_entry[i].record_id);
+        cper_entry->set_severity(
+            aga_cper_severity_to_proto(info->cper_entry[i].severity));
+        cper_entry->set_revision(info->cper_entry[i].revision);
+        cper_entry->set_timestamp(info->cper_entry[i].timestamp);
+        cper_entry->set_creatorid(info->cper_entry[i].creator_id);
+        cper_entry->set_notificationtype(
+            aga_cper_notification_type_to_proto(
+                info->cper_entry[i].notification_type));
+        for (uint32_t j = 0; j < info->cper_entry[i].num_af_id; j++) {
+            cper_entry->add_afid(info->cper_entry[i].af_id[j]);
+        }
+    }
+}
+
 #endif // __AGA_SVC_GPU_TO_PROTO_HPP__
diff --git a/sw/nic/gpuagent/svc/gpu_to_spec.hpp b/sw/nic/gpuagent/svc/gpu_to_spec.hpp
index d4e329d..92b3a86 100644
--- a/sw/nic/gpuagent/svc/gpu_to_spec.hpp
+++ b/sw/nic/gpuagent/svc/gpu_to_spec.hpp
@@ -31,6 +31,26 @@ limitations under the License.
 #include "nic/gpuagent/api/include/aga_gpu.hpp"
 #include "nic/gpuagent/api/include/aga_task.hpp"

+// convert proto cper severity to aga spec; unmapped values become NONE
+static inline aga_cper_severity_t
+aga_cper_severity_to_spec (amdgpu::CPERSeverity severity)
+{
+    switch (severity) {
+    case amdgpu::CPER_SEVERITY_NON_FATAL_UNCORRECTED:
+        return AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED;
+        break;
+    case amdgpu::CPER_SEVERITY_FATAL:
+        return AGA_CPER_SEVERITY_FATAL;
+        break;
+    case amdgpu::CPER_SEVERITY_NON_FATAL_CORRECTED:
+        return AGA_CPER_SEVERITY_NON_FATAL_CORRECTED;
+        break;
+    default:    // CPER_SEVERITY_NONE or any unmapped value
+        break;
+    }
+    return AGA_CPER_SEVERITY_NONE;
+}
+
 static inline aga_gpu_admin_state_t
 aga_gpu_admin_state_to_spec (amdgpu::GPUAdminState admin_state)
 {