diff --git a/sw/nic/gpuagent/api/include/aga_gpu.hpp b/sw/nic/gpuagent/api/include/aga_gpu.hpp
index 3af1b34..187359b 100644
--- a/sw/nic/gpuagent/api/include/aga_gpu.hpp
+++ b/sw/nic/gpuagent/api/include/aga_gpu.hpp
@@ -576,6 +576,32 @@ typedef struct aga_gpu_violation_stats_s {
     uint64_t vr_thermal_residency_accumulated;
     /// High Bandwidth Memory (HBM) thermal residency accumulated
     uint64_t hbm_thermal_residency_accumulated;
+    /// processor hot residency percentage
+    uint64_t processor_hot_residency_percentage;
+    /// Package Power Tracking (PPT) residency percentage
+    uint64_t ppt_residency_percentage;
+    /// socket thermal residency percentage
+    uint64_t socket_thermal_residency_percentage;
+    /// Voltage Rail (VR) thermal residency percentage
+    uint64_t vr_thermal_residency_percentage;
+    /// High Bandwidth Memory (HBM) thermal residency percentage
+    uint64_t hbm_thermal_residency_percentage;
+    /// gfx clock below host limit power accumulated per XCC
+    uint64_t gfx_clk_below_host_limit_power_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit thermal accumulated per XCC
+    uint64_t gfx_clk_below_host_limit_thermal_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx low utilization accumulated per XCC
+    uint64_t gfx_low_utilization_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit total accumulated per XCC
+    uint64_t gfx_clk_below_host_limit_total_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit power percentage per XCC
+    uint64_t gfx_clk_below_host_limit_power_percentage[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit thermal percentage per XCC
+    uint64_t gfx_clk_below_host_limit_thermal_percentage[AGA_GPU_MAX_XCC];
+    /// gfx low utilization percentage per XCC
+    uint64_t gfx_low_utilization_percentage[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit total percentage per XCC
+    uint64_t gfx_clk_below_host_limit_total_percentage[AGA_GPU_MAX_XCC];
 } aga_gpu_violation_stats_t;
 
 /// \brief GPU statistics
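The percentage counterparts of the existing accumulated counters, plus the new per-XCC arrays, are plain uint64_t fields, so they can be consumed directly from aga_gpu_violation_stats_t once this header change lands. Below is a minimal consumer-side sketch, assuming the patched header resolves through the same include path style used elsewhere in the tree; the function itself is illustrative and not part of this change.

#include <cinttypes>
#include <cstdio>
#include "nic/gpuagent/api/include/aga_gpu.hpp"

// Sketch: print the new per-XCC "gfx clock below host limit" percentages.
// AGA_GPU_MAX_XCC and the field names come from the hunk above; everything
// else here is illustrative.
static void
dump_gfx_below_host_limit (const aga_gpu_violation_stats_t *stats)
{
    for (uint32_t i = 0; i < AGA_GPU_MAX_XCC; i++) {
        printf("XCC %u: power %" PRIu64 "%%, thermal %" PRIu64 "%%, "
               "total %" PRIu64 "%%\n", i,
               stats->gfx_clk_below_host_limit_power_percentage[i],
               stats->gfx_clk_below_host_limit_thermal_percentage[i],
               stats->gfx_clk_below_host_limit_total_percentage[i]);
    }
}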
diff --git a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
index 3e0b180..d6fbb1a 100644
--- a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
+++ b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
@@ -940,6 +940,108 @@ smi_gpu_get_bad_page_records (void *gpu_obj,
     return SDK_RET_OK;
 }
 
+static sdk_ret_t
+smi_fill_violation_stats_ (aga_gpu_handle_t gpu_handle,
+                           uint32_t partition_id,
+                           amdsmi_gpu_metrics_t *metrics_info,
+                           aga_gpu_violation_stats_t *stats)
+{
+    amdsmi_status_t amdsmi_ret;
+    amdsmi_violation_status_t violation_status = {};
+
+    // initialize stats to invalid values
+    memset(stats, 0xff, sizeof(aga_gpu_violation_stats_t));
+
+    amdsmi_ret = amdsmi_get_violation_status(gpu_handle, &violation_status);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get violation status for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+        if (!partition_id) {
+            // fill non-partition stats from metrics info only
+            // for primary partition
+            stats->current_accumulated_counter =
+                metrics_info->accumulation_counter;
+            stats->processor_hot_residency_accumulated =
+                metrics_info->prochot_residency_acc;
+            stats->ppt_residency_accumulated =
+                metrics_info->ppt_residency_acc;
+            stats->socket_thermal_residency_accumulated =
+                metrics_info->socket_thm_residency_acc;
+            stats->vr_thermal_residency_accumulated =
+                metrics_info->vr_thm_residency_acc;
+            stats->hbm_thermal_residency_accumulated =
+                metrics_info->hbm_thm_residency_acc;
+        }
+        for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) {
+            stats->gfx_clk_below_host_limit_power_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_below_host_limit_ppt_acc[i];
+            stats->gfx_clk_below_host_limit_thermal_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_below_host_limit_thm_acc[i];
+            stats->gfx_low_utilization_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_low_utilization_acc[i];
+            stats->gfx_clk_below_host_limit_total_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_below_host_limit_total_acc[i];
+        }
+    } else {
+        if (!partition_id) {
+            // fill non-partition stats from violation status only
+            // for primary partition
+            stats->current_accumulated_counter =
+                violation_status.acc_counter;
+            stats->processor_hot_residency_accumulated =
+                violation_status.acc_prochot_thrm;
+            stats->ppt_residency_accumulated =
+                violation_status.acc_ppt_pwr;
+            stats->socket_thermal_residency_accumulated =
+                violation_status.acc_socket_thrm;
+            stats->vr_thermal_residency_accumulated =
+                violation_status.acc_vr_thrm;
+            stats->hbm_thermal_residency_accumulated =
+                violation_status.acc_hbm_thrm;
+            stats->processor_hot_residency_percentage =
+                violation_status.per_prochot_thrm;
+            stats->ppt_residency_percentage =
+                violation_status.per_ppt_pwr;
+            stats->socket_thermal_residency_percentage =
+                violation_status.per_socket_thrm;
+            stats->vr_thermal_residency_percentage =
+                violation_status.per_vr_thrm;
+            stats->hbm_thermal_residency_percentage =
+                violation_status.per_hbm_thrm;
+        }
+        for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) {
+            stats->gfx_clk_below_host_limit_power_accumulated[i] =
+                violation_status.acc_gfx_clk_below_host_limit_pwr[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_thermal_accumulated[i] =
+                violation_status.acc_gfx_clk_below_host_limit_thm[
+                    partition_id][i];
+            stats->gfx_low_utilization_accumulated[i] =
+                violation_status.acc_low_utilization[partition_id][i];
+            stats->gfx_clk_below_host_limit_total_accumulated[i] =
+                violation_status.acc_gfx_clk_below_host_limit_total[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_power_percentage[i] =
+                violation_status.per_gfx_clk_below_host_limit_pwr[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_thermal_percentage[i] =
+                violation_status.per_gfx_clk_below_host_limit_thm[
+                    partition_id][i];
+            stats->gfx_low_utilization_percentage[i] =
+                violation_status.per_low_utilization[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_total_percentage[i] =
+                violation_status.per_gfx_clk_below_host_limit_total[
+                    partition_id][i];
+        }
+    }
+    return SDK_RET_OK;
+}
+
 static sdk_ret_t
 smi_fill_vram_usage_ (aga_gpu_handle_t gpu_handle,
                       aga_gpu_vram_usage_t *usage)
@@ -1016,7 +1118,6 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
     amdsmi_status_t amdsmi_ret;
     uint64_t sent, received, max_pkt_size;
     amdsmi_gpu_metrics_t metrics_info = {};
-    amdsmi_violation_status_t violation_status = {};
 
     // fill VRAM usage
     smi_fill_vram_usage_(gpu_handle, &stats->vram_usage);
@@ -1045,36 +1146,10 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
             metrics_info.xgmi_write_data_acc[i];
     }
     // fill violation statistics
-    amdsmi_ret = amdsmi_get_violation_status(gpu_handle, &violation_status);
-    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
-        AGA_TRACE_ERR("Failed to get violation status for GPU {}, err {}",
-                      gpu_handle, amdsmi_ret);
-        // revert to populating from metrics payload
-        stats->violation_stats.current_accumulated_counter =
-            metrics_info.accumulation_counter;
-        stats->violation_stats.processor_hot_residency_accumulated =
-            metrics_info.prochot_residency_acc;
-        stats->violation_stats.ppt_residency_accumulated =
-            metrics_info.ppt_residency_acc;
-        stats->violation_stats.socket_thermal_residency_accumulated =
-            metrics_info.socket_thm_residency_acc;
-        stats->violation_stats.vr_thermal_residency_accumulated =
-            metrics_info.vr_thm_residency_acc;
-        stats->violation_stats.hbm_thermal_residency_accumulated =
-            metrics_info.hbm_thm_residency_acc;
-    } else {
-        stats->violation_stats.current_accumulated_counter =
-            violation_status.acc_counter;
-        stats->violation_stats.processor_hot_residency_accumulated =
-            violation_status.acc_prochot_thrm;
-        stats->violation_stats.ppt_residency_accumulated =
-            violation_status.acc_ppt_pwr;
-        stats->violation_stats.socket_thermal_residency_accumulated =
-            violation_status.acc_socket_thrm;
-        stats->violation_stats.vr_thermal_residency_accumulated =
-            violation_status.acc_vr_thrm;
-        stats->violation_stats.hbm_thermal_residency_accumulated =
-            violation_status.acc_hbm_thrm;
+    if (!partition_id) {
+        smi_fill_violation_stats_(gpu_handle, partition_id,
+                                  &metrics_info,
+                                  &stats->violation_stats);
     }
     // get usage information from the metrics info for partition 0
     for (uint16_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) {
@@ -1173,6 +1248,10 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
                 metrics_info.xcp_stats[partition_id].gfx_busy_inst[i];
             }
         }
+        // get violation stats from first gpu partition for XCP level data
+        smi_fill_violation_stats_(first_partition_handle, partition_id,
+                                  &metrics_info,
+                                  &stats->violation_stats);
     }
     return SDK_RET_OK;
 }
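Read on their own, the two new call sites in smi_gpu_fill_stats() are easy to conflate: one runs only for the primary partition, the other pulls XCP-level data through the first partition's handle. A condensed sketch of that intent follows; the wrapper function, its signature, and the gpu_is_partitioned guard are invented purely for illustration (in the real function the second call sits inside the existing partition-handling block), and only the smi_fill_violation_stats_() calls mirror the diff.

// Illustrative only: how the two smi_fill_violation_stats_() call sites above
// relate. Names other than smi_fill_violation_stats_() are hypothetical.
static void
fill_violation_stats_sketch (aga_gpu_handle_t gpu_handle,
                             aga_gpu_handle_t first_partition_handle,
                             uint32_t partition_id,
                             bool gpu_is_partitioned,
                             amdsmi_gpu_metrics_t *metrics_info,
                             aga_gpu_stats_t *stats)
{
    if (!partition_id) {
        // primary partition (or an unpartitioned GPU): socket-wide counters,
        // the new percentages and the per-XCC arrays come from the GPU's own
        // handle
        smi_fill_violation_stats_(gpu_handle, partition_id, metrics_info,
                                  &stats->violation_stats);
    }
    if (gpu_is_partitioned) {
        // GPU partitions: XCP-level (per-XCC) data is read through the first
        // partition's handle and indexed by this partition's id
        smi_fill_violation_stats_(first_partition_handle, partition_id,
                                  metrics_info, &stats->violation_stats);
    }
}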
clock_status = &status->clock_status[clk_cnt]; + // min and max frequencies are per clock type + find_low_high_frequency(&freq, + &clock_status->low_frequency, + &clock_status->high_frequency); + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_PCIE; + clock_status->frequency = freq.frequency[freq.current]/1000000; + clock_status->deep_sleep = + (clock_status->frequency < clock_status->low_frequency); + clk_cnt++; + } + status->num_clock_status = clk_cnt; + return SDK_RET_OK; +} + +/// \brief fill PCIe status +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_pcie_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_status_t *status) +{ + amdsmi_pcie_info_t info; + amdsmi_status_t amdsmi_ret; + aga_gpu_pcie_status_t *pcie_status = &status->pcie_status; + + amdsmi_ret = amdsmi_get_pcie_info(gpu_handle, &info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get PCIe info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + pcie_status->slot_type = + smi_to_aga_pcie_slot_type(info.pcie_static.slot_type); + pcie_status->max_width = info.pcie_static.max_pcie_width; + pcie_status->max_speed = info.pcie_static.max_pcie_speed/1000; + pcie_status->version = info.pcie_static.pcie_interface_version; + pcie_status->width = info.pcie_metric.pcie_width; + pcie_status->speed = info.pcie_metric.pcie_speed/1000; + pcie_status->bandwidth = info.pcie_metric.pcie_bandwidth; + } + return SDK_RET_OK; +} + +/// \brief fill VRAM status +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_vram_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_vram_status_t *status) +{ + amdsmi_vram_info_t info; + amdsmi_status_t amdsmi_ret; + + amdsmi_ret = amdsmi_get_gpu_vram_info(gpu_handle, &info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get VRAM info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + status->type = smi_to_aga_vram_type(info.vram_type); + memcpy(status->vendor, info.vram_vendor, AGA_MAX_STR_LEN); + status->size = info.vram_size; + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_partition_info (aga_gpu_handle_t gpu_handle, bool *capable, + aga_gpu_compute_partition_type_t *compute_partition, + aga_gpu_memory_partition_type_t *memory_partition) +{ + amdsmi_status_t amdsmi_ret; + amdsmi_gpu_metrics_t metrics_info = {}; + char partition_type[AGA_MAX_STR_LEN + 1]; + + *capable = true; + *compute_partition = AGA_GPU_COMPUTE_PARTITION_TYPE_NONE; + *memory_partition = AGA_GPU_MEMORY_PARTITION_TYPE_NONE; + // to deduce partition capability of platform, we rely on + // metrics field num_partition of a GPU field to be 0xffff + // on partition supported platform, this api is not supported + // for paritioned GPU other than index 0 or first_handle + // we mark the capablity to true on such cases to specify platform + // partition capability + amdsmi_ret = amdsmi_get_gpu_metrics_info(gpu_handle, + &metrics_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + if ((metrics_info.num_partition & 0xffff) == + AMDSMI_INVALID_PARTITION_COUNT) { + // this is unsupported platform like Mi2xx + *capable = false; + } + } + // fill compute partition type + amdsmi_ret = 
amdsmi_get_gpu_compute_partition(gpu_handle,
+                                     partition_type, AGA_MAX_STR_LEN + 1);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get compute partition for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        *compute_partition =
+            smi_to_aga_gpu_compute_partition_type(partition_type);
+    }
+    // fill memory partition type
+    amdsmi_ret = amdsmi_get_gpu_memory_partition(gpu_handle,
+                     partition_type, AGA_MAX_STR_LEN + 1);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get memory partition for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        *memory_partition =
+            smi_to_aga_gpu_memory_partition_type(partition_type);
+    }
+    return SDK_RET_OK;
+}
+
+sdk_ret_t
+smi_get_gpu_partition_id (aga_gpu_handle_t gpu_handle, uint32_t *partition_id)
+{
+    amdsmi_status_t status;
+    amdsmi_kfd_info_t kfd_info;
+
+    status = amdsmi_get_gpu_kfd_info(gpu_handle, &kfd_info);
+    if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get partition id of GPU {}, err {}",
+                      gpu_handle, status);
+        return amdsmi_ret_to_sdk_ret(status);
+    }
+    *partition_id = kfd_info.current_partition_id;
+    return SDK_RET_OK;
+}
+
+sdk_ret_t
+smi_gpu_fill_status (aga_gpu_handle_t gpu_handle, uint32_t gpu_id,
+                     aga_gpu_spec_t *spec, aga_gpu_status_t *status)
+{
+    amdsmi_status_t amdsmi_ret;
+    amdsmi_xgmi_status_t xgmi_st;
+    amdsmi_od_volt_freq_data_t vc_data;
+    amdsmi_gpu_metrics_t metrics_info = { 0 };
+
+    if (g_gpu_metrics.find(gpu_handle) != g_gpu_metrics.end()) {
+        metrics_info = g_gpu_metrics[gpu_handle];
+        // fill the clock status with metrics info
+        smi_fill_clock_status_(gpu_handle, spec, status, &metrics_info);
+        // fill firmware timestamp
+        status->fw_timestamp = metrics_info.firmware_timestamp;
+        if (metrics_info.throttle_status !=
+            std::numeric_limits<decltype(metrics_info.throttle_status)>::max()) {
+            status->throttling_status =
+                metrics_info.throttle_status ? AGA_GPU_THROTTLING_STATUS_ON :
+                                               AGA_GPU_THROTTLING_STATUS_OFF;
+        }
+        status->xgmi_status.width = metrics_info.xgmi_link_width;
+        status->xgmi_status.speed = metrics_info.xgmi_link_speed;
+    } else {
+        AGA_TRACE_ERR("GPU metrics info not available in cache for GPU {}",
+                      gpu_handle);
+    }
+    // fill the PCIe status
+    smi_fill_pcie_status_(gpu_handle, status);
+    // fill the xgmi error count
+    amdsmi_ret = amdsmi_gpu_xgmi_error_status(gpu_handle, &xgmi_st);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get xgmi error status for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        status->xgmi_status.error_status = smi_to_aga_gpu_xgmi_error(xgmi_st);
+    }
+    // fill the voltage curve points
+    amdsmi_ret = amdsmi_get_gpu_od_volt_info(gpu_handle, &vc_data);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get voltage curve points for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        for (uint32_t i = 0;
+             (i < AGA_GPU_MAX_VOLTAGE_CURVE_POINT) &&
+             (i < AMDSMI_NUM_VOLTAGE_CURVE_POINTS); i++) {
+            status->voltage_curve_point[i].point = i;
+            status->voltage_curve_point[i].frequency =
+                vc_data.curve.vc_points[i].frequency/1000000;
+            status->voltage_curve_point[i].voltage =
+                vc_data.curve.vc_points[i].voltage;
+        }
+    }
+    smi_fill_gpu_kfd_pid_status_(gpu_handle, gpu_id, status);
+    smi_fill_gpu_enumeration_id_status_(gpu_handle, status);
+    // TODO: oper status
+    // TODO: RAS status
+    return SDK_RET_OK;
+}
+
+/// \brief function to get number of bad pages for GPU
+/// \param[in] gpu GPU object
+/// \param[out] num_bad_pages number of bad pages
+/// \return SDK_RET_OK or error code in case of failure
+sdk_ret_t
+smi_gpu_get_bad_page_count (void *gpu_obj,
+                            uint32_t *num_bad_pages)
+{
+    amdsmi_status_t amdsmi_ret;
+    gpu_entry *gpu = (gpu_entry *)gpu_obj;
+
+    // get number of bad page records
+    amdsmi_ret = amdsmi_get_gpu_bad_page_info(gpu->handle(),
+                                              num_bad_pages, NULL);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get bad page information for GPU {}, err {}",
+                      gpu->handle(), amdsmi_ret);
+        return amdsmi_ret_to_sdk_ret(amdsmi_ret);
+    }
+    return SDK_RET_OK;
+}
+
+/// \brief function to get GPU bad page records
+/// \param[in] gpu GPU object
+/// \param[in] num_bad_pages number of bad pages
+/// \param[out] records GPU bad page records
+/// \return SDK_RET_OK or error code in case of failure
+sdk_ret_t
+smi_gpu_get_bad_page_records (void *gpu_obj,
+                              uint32_t num_bad_pages,
+                              aga_gpu_bad_page_record_t *records)
+{
+    amdsmi_status_t amdsmi_ret;
+    gpu_entry *gpu = (gpu_entry *)gpu_obj;
+    amdsmi_retired_page_record_t *bad_pages;
+
+    // allocate memory for bad pages
+    bad_pages =
+        (amdsmi_retired_page_record_t *)malloc(
+            num_bad_pages * sizeof(amdsmi_retired_page_record_t));
+    if (!bad_pages) {
+        AGA_TRACE_ERR("Failed to allocate memory for bad page information "
+                      "for GPU {}", gpu->key().str());
+        return SDK_RET_OOM;
+    }
+    // fill bad page records
+    amdsmi_ret = amdsmi_get_gpu_bad_page_info(gpu->handle(), &num_bad_pages,
+                                              bad_pages);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get bad page information for GPU {}, "
+                      "err {}", gpu->handle(), amdsmi_ret);
+        // free the allocated buffer before bailing out to avoid a leak
+        free(bad_pages);
+        return amdsmi_ret_to_sdk_ret(amdsmi_ret);
+    } else {
+        for (uint32_t i = 0; i < num_bad_pages; i++) {
+            records[i].key = gpu->key();
+            records[i].page_address = bad_pages[i].page_address;
+            records[i].page_size = bad_pages[i].page_size;
+            records[i].page_status =
smi_to_aga_gpu_page_status(bad_pages[i].status); + } + } + // free memory + free(bad_pages); + return SDK_RET_OK; +} + +static sdk_ret_t +smi_fill_vram_usage_ (aga_gpu_handle_t gpu_handle, + aga_gpu_vram_usage_t *usage) +{ + uint64_t value_64; + amdsmi_status_t amdsmi_ret; + + amdsmi_ret = amdsmi_get_gpu_memory_total(gpu_handle, + AMDSMI_MEM_TYPE_VRAM, &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get VRAM total memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->total_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_total(gpu_handle, + AMDSMI_MEM_TYPE_VIS_VRAM, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get visible VRAM total memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->total_visible_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_total(gpu_handle, + AMDSMI_MEM_TYPE_GTT, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GTT total memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->total_gtt = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_usage(gpu_handle, AMDSMI_MEM_TYPE_VRAM, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get VRAM used memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->used_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_usage(gpu_handle, + AMDSMI_MEM_TYPE_VIS_VRAM, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get visible VRAM used memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->used_visible_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_usage(gpu_handle, + AMDSMI_MEM_TYPE_GTT, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GTT used memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->used_gtt = value_64/1024/1024; + } + usage->free_vram = usage->total_vram - usage->used_vram; + usage->free_visible_vram = usage->total_visible_vram - + usage->used_visible_vram; + usage->free_gtt = usage->total_gtt - usage->used_gtt; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle, + bool partition_capable, + uint32_t partition_id, + aga_gpu_handle_t first_partition_handle, + aga_gpu_stats_t *stats) +{ + amdsmi_status_t amdsmi_ret; + uint64_t sent, received, max_pkt_size; + amdsmi_gpu_metrics_t metrics_info = {}; + amdsmi_violation_status_t violation_status = {}; + + // fill VRAM usage + smi_fill_vram_usage_(gpu_handle, &stats->vram_usage); + // fill additional statistics from gpu metrics + if (g_gpu_metrics.find(gpu_handle) != g_gpu_metrics.end()) { + metrics_info = g_gpu_metrics[gpu_handle]; + // power and voltage + stats->avg_package_power = metrics_info.average_socket_power; + stats->package_power = metrics_info.current_socket_power; + stats->voltage.voltage = metrics_info.voltage_soc; + stats->voltage.gfx_voltage = metrics_info.voltage_gfx; + stats->voltage.memory_voltage = metrics_info.voltage_mem; + // fan speed + stats->fan_speed = metrics_info.current_fan_speed; + // activity information + stats->usage.gfx_activity = metrics_info.average_gfx_activity; + stats->usage.umc_activity = metrics_info.average_umc_activity; + stats->usage.mm_activity = metrics_info.average_mm_activity; + stats->gfx_activity_accumulated = 
metrics_info.gfx_activity_acc; + stats->mem_activity_accumulated = metrics_info.mem_activity_acc; + // xgmi link stats + for (uint32_t i = 0; i < AGA_GPU_MAX_XGMI_LINKS; i++) { + stats->xgmi_link_stats[i].data_read = + metrics_info.xgmi_read_data_acc[i]; + stats->xgmi_link_stats[i].data_write = + metrics_info.xgmi_write_data_acc[i]; + } + // fill violation statistics + amdsmi_ret = amdsmi_get_violation_status(gpu_handle, &violation_status); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get violation status for GPU {}, err {}", + gpu_handle, amdsmi_ret); + // revert to populating from metrics payload + stats->violation_stats.current_accumulated_counter = + metrics_info.accumulation_counter; + stats->violation_stats.processor_hot_residency_accumulated = + metrics_info.prochot_residency_acc; + stats->violation_stats.ppt_residency_accumulated = + metrics_info.ppt_residency_acc; + stats->violation_stats.socket_thermal_residency_accumulated = + metrics_info.socket_thm_residency_acc; + stats->violation_stats.vr_thermal_residency_accumulated = + metrics_info.vr_thm_residency_acc; + stats->violation_stats.hbm_thermal_residency_accumulated = + metrics_info.hbm_thm_residency_acc; + } else { + stats->violation_stats.current_accumulated_counter = + violation_status.acc_counter; + stats->violation_stats.processor_hot_residency_accumulated = + violation_status.acc_prochot_thrm; + stats->violation_stats.ppt_residency_accumulated = + violation_status.acc_ppt_pwr; + stats->violation_stats.socket_thermal_residency_accumulated = + violation_status.acc_socket_thrm; + stats->violation_stats.vr_thermal_residency_accumulated = + violation_status.acc_vr_thrm; + stats->violation_stats.hbm_thermal_residency_accumulated = + violation_status.acc_hbm_thrm; + } + // get usage information from the metrics info for partition 0 + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { + stats->usage.vcn_activity[i] = metrics_info.vcn_activity[i]; + if (partition_capable) { + stats->usage.vcn_busy[i] = + metrics_info.xcp_stats[partition_id].vcn_busy[i]; + } else { + stats->usage.vcn_busy[i] = AMDSMI_INVALID_UINT16; + } + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_JPEG; i++) { + stats->usage.jpeg_activity[i] = metrics_info.jpeg_activity[i]; + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_JPEG_ENG_V1; i++) { + if (partition_capable) { + stats->usage.jpeg_busy[i] = + metrics_info.xcp_stats[partition_id].jpeg_busy[i]; + } else { + stats->usage.jpeg_busy[i] = AMDSMI_INVALID_UINT16; + } + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + if (partition_capable) { + stats->usage.gfx_busy_inst[i] = + metrics_info.xcp_stats[partition_id].gfx_busy_inst[i]; + } else { + stats->usage.gfx_busy_inst[i] = AMDSMI_INVALID_UINT32; + } + } + // fill the energy consumed + stats->energy_consumed = metrics_info.energy_accumulator * + g_energy_counter_resolution; + // fill temperature + stats->temperature.edge_temperature = + (float)metrics_info.temperature_edge; + stats->temperature.junction_temperature = + (float)metrics_info.temperature_hotspot; + stats->temperature.memory_temperature = + (float)metrics_info.temperature_mem; + for (uint32_t i = 0; i < AGA_GPU_MAX_HBM; i++) { + stats->temperature.hbm_temperature[i] = + (float)metrics_info.temperature_hbm[i]; + } + // pcie stats + stats->pcie_stats.replay_count = metrics_info.pcie_replay_count_acc; + stats->pcie_stats.recovery_count = + metrics_info.pcie_l0_to_recov_count_acc; + stats->pcie_stats.replay_rollover_count = + 
metrics_info.pcie_replay_rover_count_acc; + stats->pcie_stats.nack_sent_count = + metrics_info.pcie_nak_sent_count_acc; + stats->pcie_stats.nack_received_count = + metrics_info.pcie_nak_rcvd_count_acc; + stats->pcie_stats.bidir_bandwidth = + metrics_info.pcie_bandwidth_acc; + + // PCIe throughput initialization to invalid value + stats->pcie_stats.tx_bytes = AMDSMI_INVALID_UINT64; + stats->pcie_stats.rx_bytes = AMDSMI_INVALID_UINT64; + + amdsmi_ret = amdsmi_get_gpu_pci_throughput(gpu_handle, &sent, &received, + &max_pkt_size); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get PCIe throughput for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + stats->pcie_stats.tx_bytes = received; + stats->pcie_stats.rx_bytes = sent; + } + } else { + AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // for GPU partitions which are not the first partition, we need to get + // usage information from the first partition + // partition + if (partition_id) { + // get gfx, vcn and jpeg usage from first gpu partition + amdsmi_ret = amdsmi_get_gpu_metrics_info(first_partition_handle, + &metrics_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}", + first_partition_handle, amdsmi_ret); + } else { + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { + stats->usage.vcn_busy[i] = + metrics_info.xcp_stats[partition_id].vcn_busy[i]; + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_JPEG_ENG_V1; i++) { + stats->usage.jpeg_busy[i] = + metrics_info.xcp_stats[partition_id].jpeg_busy[i]; + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + stats->usage.gfx_busy_inst[i] = + metrics_info.xcp_stats[partition_id].gfx_busy_inst[i]; + } + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_event_read_all (aga_event_read_cb_t cb, void *ctxt) +{ + return g_smi_state.event_read(cb, ctxt); +} + +sdk_ret_t +smi_gpu_reset (aga_gpu_handle_t gpu_handle, + aga_gpu_reset_type_t reset_type) +{ + amdsmi_status_t amdsmi_ret; + amdsmi_power_cap_info_t power_cap_info; + + switch(reset_type) { + case AGA_GPU_RESET_TYPE_NONE: + // reset GPU itself + amdsmi_ret = amdsmi_reset_gpu(gpu_handle); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset GPU {}, err {}", gpu_handle, + amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_CLOCK: + // reset overdrive + amdsmi_ret = amdsmi_set_gpu_overdrive_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset overdrive, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // setting perf level to auto seems to be reset clocks as well + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset clocks, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_FAN: + // reset fans + amdsmi_ret = amdsmi_reset_gpu_fan(gpu_handle, 0); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset fans, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_POWER_PROFILE: + // reset power profile to bootup default + amdsmi_ret = amdsmi_set_gpu_power_profile(gpu_handle, 0, + AMDSMI_PWR_PROF_PRST_BOOTUP_DEFAULT); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset power profile, GPU {}, err {}", + gpu_handle, 
amdsmi_ret); + } + // also reset perf level to auto + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset perf level, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_POWER_OVERDRIVE: + // get default power overdrive + amdsmi_ret = amdsmi_get_power_cap_info(gpu_handle, 0, + &power_cap_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get default power cap, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // set power overdrive to default + amdsmi_ret = amdsmi_set_power_cap(gpu_handle, 0, + power_cap_info.default_power_cap); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set power cap to default, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_XGMI_ERROR: + // reset xgmi error status + amdsmi_ret = amdsmi_reset_gpu_xgmi_error(gpu_handle); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset xgmi error status, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_PERF_DETERMINISM: + // resetting perf level to "auto" resets performance determinism + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset perf level, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_COMPUTE_PARTITION: + // TODO: reset partition not yet support by amd-smi + return SDK_RET_OP_NOT_SUPPORTED; + break; + case AGA_GPU_RESET_TYPE_NPS_MODE: + // TODO: reset NPS mode + return SDK_RET_OP_NOT_SUPPORTED; + break; + default: + AGA_TRACE_ERR("unknown reset request for GPU {}", gpu_handle); + return SDK_RET_INVALID_ARG; + } + + return amdsmi_ret_to_sdk_ret(amdsmi_ret); +} + +static sdk_ret_t +smi_gpu_power_cap_update_ (aga_gpu_handle_t gpu_handle, + aga_gpu_spec_t *spec) +{ + amdsmi_status_t amdsmi_ret; + amdsmi_power_cap_info_t power_cap_info; + + // 1. get power cap range + // 2. validate the power cap is within the range + // 3. 
set power cap + // NOTE: power cap 0 indicates reset to default + + // step1: get power cap range + amdsmi_ret = amdsmi_get_power_cap_info(gpu_handle, 0, &power_cap_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get power cap, GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + // step2: validate power cap + power_cap_info.min_power_cap /= 1000000; + power_cap_info.max_power_cap /= 1000000; + if ((spec->gpu_power_cap < power_cap_info.min_power_cap) || + (spec->gpu_power_cap > power_cap_info.max_power_cap)) { + AGA_TRACE_ERR("Power cap {} is out of supported range, GPU {}, " + "allowed range {}-{}", spec->gpu_power_cap, + gpu_handle, power_cap_info.min_power_cap, + power_cap_info.max_power_cap); + return sdk_ret_t(SDK_RET_INVALID_ARG, + ERR_CODE_SMI_GPU_POWER_CAP_OUT_OF_RANGE); + } + // step3: set power cap + amdsmi_ret = amdsmi_set_power_cap(gpu_handle, 0, + (spec->gpu_power_cap * 1000000)); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set power cap, GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_update (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + uint64_t upd_mask) +{ + sdk_ret_t ret; + std::ofstream of; + std::string dev_path; + amdsmi_status_t amdsmi_ret; + amdsmi_clk_type_t clock_type; + amdsmi_dev_perf_level_t perf_level; + + // performance level has to be set to manual (default is auto) to configure + // the following list of attributes to non default values + // 1. GPU overdrive level + // 2. memory overdirve level + + // set compute partition type; we return after this operation as it doesn't + // make sense to update other fields along with compute partition type + if (upd_mask & AGA_GPU_UPD_COMPUTE_PARTITION_TYPE) { + amdsmi_ret = amdsmi_set_gpu_compute_partition(gpu_handle, + aga_to_smi_gpu_compute_partition_type( + spec->compute_partition_type)); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set GPU compute partition type to {}, " + "GPU {}, err {}", spec->compute_partition_type, + gpu_handle, amdsmi_ret); + } + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + + // set memory partition type; we return after this operation as it doesn't + // make sense to update other fields along with memory partition type + if (upd_mask & AGA_GPU_UPD_MEMORY_PARTITION_TYPE) { + amdsmi_ret = amdsmi_set_gpu_memory_partition(gpu_handle, + aga_to_smi_gpu_memory_partition_type( + spec->memory_partition_type)); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set GPU memory partition type to {}, " + "GPU {}, err {}", spec->memory_partition_type, + gpu_handle, amdsmi_ret); + } + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + + // set performance level to manual if required + if (upd_mask & AGA_GPU_UPD_OVERDRIVE_LEVEL) { + amdsmi_ret = amdsmi_get_gpu_perf_level(gpu_handle, &perf_level); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get performance level GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + // if performance level is not manual already, set it to manual + if (perf_level != AMDSMI_DEV_PERF_LEVEL_MANUAL) { + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_MANUAL); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set performance level to manual, " + "GPU {}, err 
{}", gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + } + // overdrive update + if (upd_mask & AGA_GPU_UPD_OVERDRIVE_LEVEL) { + amdsmi_ret = amdsmi_set_gpu_overdrive_level(gpu_handle, + spec->overdrive_level); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set overdrive level, GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + // system clock frequence range update + if (upd_mask & AGA_GPU_UPD_CLOCK_FREQ_RANGE) { + for (uint32_t i = 0; i < AGA_GPU_NUM_CFG_CLOCK_TYPES; i++) { + ret = aga_to_smi_gpu_clock_type(spec->clock_freq[i].clock_type, + &clock_type); + if (ret != SDK_RET_OK) { + AGA_TRACE_ERR("Invalid clock type {} specified, GPU {}", + spec->clock_freq[i].clock_type, gpu_handle); + return SDK_RET_INVALID_ARG; + } + amdsmi_ret = amdsmi_set_gpu_clk_range(gpu_handle, + spec->clock_freq[i].lo, spec->clock_freq[i].hi, + clock_type); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set clock {} frequency range, GPU {}, " + "range {}-{}, err {}", + spec->clock_freq[i].clock_type, gpu_handle, + spec->clock_freq[i].lo, spec->clock_freq[i].hi, + amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + } + // power cap update + if (upd_mask & AGA_GPU_UPD_POWER_CAP) { + ret = smi_gpu_power_cap_update_(gpu_handle, spec); + if (ret != SDK_RET_OK) { + return ret; + } + } + // performance level update + if (upd_mask & AGA_GPU_UPD_PERF_LEVEL) { + perf_level = aga_to_smi_gpu_perf_level(spec->perf_level); + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, perf_level); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set performance level to {}, " + "GPU {}, err {}", perf_level, gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + // fan speed update + if (upd_mask & AGA_GPU_UPD_FAN_SPEED) { + amdsmi_ret = amdsmi_set_gpu_fan_speed(gpu_handle, 0, + (int64_t)spec->fan_speed); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set fan speed to {}, GPU {}, err {}", + spec->fan_speed, gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + // TODO: RAS spec update + return SDK_RET_OK; +} + +/// \brief callback function to be used to fill topology information between +/// two GPUS +/// \param[in] obj GPU object returned by walk function +/// \param[in] ctxt opaque context passed to the callback function +/// \return false in case walk should continue or true otherwise +static inline bool +gpu_topo_walk_cb (void *obj, void *ctxt) +{ + gpu_entry *gpu1, *gpu2; + amdsmi_status_t amdsmi_ret; + static std::string name = "GPU"; + gpu_topo_walk_ctxt_t *walk_ctxt; + aga_device_topology_info_t *info; + + gpu2 = (gpu_entry *)obj; + walk_ctxt = (gpu_topo_walk_ctxt_t *)ctxt; + gpu1 = walk_ctxt->gpu; + info = walk_ctxt->info; + + if (gpu1->handle() != gpu2->handle()) { + info->peer_device[walk_ctxt->count].peer_device.type = + AGA_DEVICE_TYPE_GPU; + strcpy(info->peer_device[walk_ctxt->count].peer_device.name, + (name + std::to_string(gpu2->id())).c_str()); + amdsmi_ret = + amdsmi_topo_get_link_type(gpu1->handle(), gpu2->handle(), + &info->peer_device[walk_ctxt->count].num_hops, + (amdsmi_link_type_t *) + &info->peer_device[walk_ctxt->count].connection.type); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get link type between gpus {} and {}, " + "err {}", gpu1->handle(), gpu2->handle(), amdsmi_ret); + // in case 
of error set num hops to 0xffff and IO link type to + // none + info->peer_device[walk_ctxt->count].num_hops = 0xffff; + info->peer_device[walk_ctxt->count].connection.type = + AGA_IO_LINK_TYPE_NONE; + } + amdsmi_ret = amdsmi_topo_get_link_weight(gpu1->handle(), gpu2->handle(), + &info->peer_device[walk_ctxt->count].link_weight); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get weight for link between gpus {}" + "and {}, err {}", gpu1->handle(), gpu2->handle(), + amdsmi_ret); + // in case of error set link weight to 0xffff + info->peer_device[walk_ctxt->count].link_weight = 0xffff; + } + info->peer_device[walk_ctxt->count].valid = true; + walk_ctxt->count++; + } + return false; +} + +sdk_ret_t +smi_gpu_fill_device_topology (aga_gpu_handle_t gpu_handle, + aga_device_topology_info_t *info) +{ + gpu_entry *gpu; + gpu_topo_walk_ctxt_t ctxt; + + gpu = gpu_db()->find(gpu_handle); + if (gpu == NULL) { + AGA_TRACE_ERR("Failed to find GPU {}", gpu_handle); + return SDK_RET_ENTRY_NOT_FOUND; + } + + ctxt.count = 0; + ctxt.info = info; + ctxt.gpu = gpu; + + // walk gpu db and fill device topology + gpu_db()->walk_handle_db(gpu_topo_walk_cb, &ctxt); + return SDK_RET_OK; +} + +/// \brief function to get aga_obj_key_t for a given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] key aga_obj_key_t of the GPU +static sdk_ret_t +smi_gpu_uuid_get (aga_gpu_handle_t gpu_handle, aga_obj_key_t *key) +{ + amdsmi_status_t status; + char uuid_rem[20]; + char uuid[AMDSMI_GPU_UUID_SIZE]; + uint32_t uuid_len = AMDSMI_GPU_UUID_SIZE; + + // get uuid from amdsmi + status = amdsmi_get_gpu_device_uuid(gpu_handle, &uuid_len, uuid); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get uuid of GPU {}, err {}", + gpu_handle, status); + return amdsmi_ret_to_sdk_ret(status); + } + // amdsmi returns a string containing the uuid of the GPU (ex: + // 2eff74a1-0000-1000-80fe-9cea14a6b148); to derive the aga_obj_key_t from + // it we scan the string and construct our aga_obj_key_t + sscanf(uuid, "%x-%hx-%hx-%hx-%s", (uint32_t *)&key->id[0], + (uint16_t *)&key->id[4], (uint16_t *)&key->id[6], + (uint16_t *)&key->id[8], uuid_rem); + *(uint32_t *)&key->id[0] = htonl(*(uint32_t *)&key->id[0]); + *(uint16_t *)&key->id[4] = htons(*(uint16_t *)&key->id[4]); + *(uint16_t *)&key->id[6] = htons(*(uint16_t *)&key->id[6]); + *(uint16_t *)&key->id[8] = htons(*(uint16_t *)&key->id[8]); + sscanf(uuid_rem, "%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx", &key->id[10], + &key->id[11], &key->id[12], &key->id[13], &key->id[14], + &key->id[15]); + return SDK_RET_OK; +} + +sdk_ret_t +smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles, + aga_obj_key_t *gpu_keys) +{ + sdk_ret_t ret; + uint32_t num_procs; + uint32_t num_sockets; + amdsmi_status_t status; + processor_type_t proc_type; + amdsmi_socket_handle socket_handles[AGA_MAX_SOCKET]; + aga_gpu_handle_t proc_handles[AGA_MAX_PROCESSORS_PER_SOCKET]; + + if (!num_gpus) { + return SDK_RET_ERR; + } + *num_gpus = 0; + // get the socket count available in the system + status = amdsmi_get_socket_handles(&num_sockets, NULL); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get number of sockets from amd smi library, " + "err {}", status); + return amdsmi_ret_to_sdk_ret(status); + } + // get the socket handles in the system + status = amdsmi_get_socket_handles(&num_sockets, &socket_handles[0]); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get socket handles from amd smi 
library, " + "err {}", status); + return amdsmi_ret_to_sdk_ret(status); + } + for (uint32_t i = 0; i < num_sockets; i++) { + // for each socket get the number of processors + status = amdsmi_get_processor_handles(socket_handles[i], + &num_procs, NULL); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get number of processors in socket handle " + "{} from amd smi library, err {}", socket_handles[i], + status); + return amdsmi_ret_to_sdk_ret(status); + } + // for each socket get the processor handles + status = amdsmi_get_processor_handles(socket_handles[i], + &num_procs, &proc_handles[0]); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get processor handles in socket handle " + "{} from amd smi library, err {}", socket_handles[i], + status); + return amdsmi_ret_to_sdk_ret(status); + } + // get uuids of each GPU + for (uint32_t j = 0; j < num_procs; j++) { + status = amdsmi_get_processor_type(proc_handles[j], &proc_type); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get processor type of processor {}" + " from amd smi library, err {}", proc_handles[j], + status); + return amdsmi_ret_to_sdk_ret(status); + } + if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + gpu_handles[*num_gpus] = proc_handles[j]; + if (gpu_keys) { + ret = smi_gpu_uuid_get(proc_handles[j], + &gpu_keys[*num_gpus]); + if (ret != SDK_RET_OK) { + AGA_TRACE_ERR("GPU discovery failed due to error in " + "getting UUID of GPU {}", + proc_handles[j]); + return ret; + } + } + (*num_gpus)++; + } + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + aga_gpu_status_t *status) +{ + uint64_t value_64; + amdsmi_fw_info_t fw_info; + amdsmi_status_t amdsmi_ret; + amdsmi_vbios_info_t vbios_info; + amdsmi_board_info_t board_info; + amdsmi_driver_info_t driver_info; + amdsmi_virtualization_mode_t mode; + + // fill immutable attributes in spec + // fill gpu and memory clock frequencies + smi_fill_gpu_clock_frequency_spec_(gpu_handle, spec); + + // fill immutable attributes in status + // fill the GPU serial number + amdsmi_ret = amdsmi_get_gpu_board_info(gpu_handle, &board_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get serial number for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // fill the virtualization mode + amdsmi_ret = amdsmi_get_gpu_virtualization_mode(gpu_handle, &mode); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get virtualization mode for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + status->virtualization_mode = smi_to_aga_virtualization_mode(mode); + } + memcpy(status->serial_num, board_info.product_serial, AGA_MAX_STR_LEN); + // fill the GPU card series + memcpy(status->card_series, board_info.product_name, AGA_MAX_STR_LEN); + // fill the GPU vendor information + memcpy(status->card_vendor, board_info.manufacturer_name, AGA_MAX_STR_LEN); + // fill the GPU card model + memcpy(status->card_model, board_info.model_number, AGA_MAX_STR_LEN); + // fill the driver version + amdsmi_ret = amdsmi_get_gpu_driver_info(gpu_handle, &driver_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get system driver information, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + memcpy(status->driver_version, driver_info.driver_version, AGA_MAX_STR_LEN); + + // fill the vbios version + amdsmi_ret = amdsmi_get_gpu_vbios_info(gpu_handle, &vbios_info); + 
if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get vbios version for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        strncpy(status->vbios_version, vbios_info.version, AGA_MAX_STR_LEN);
+        strncpy(status->vbios_part_number, vbios_info.part_number,
+                AGA_MAX_STR_LEN);
+        // sku should be retrieved from vbios version
+        gpu_get_sku_from_vbios_(status->card_sku, vbios_info.part_number);
+    }
+    // fill the firmware version
+    amdsmi_ret = amdsmi_get_fw_info(gpu_handle, &fw_info);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get firmware version for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        memset(status->fw_version, 0,
+               sizeof(aga_gpu_fw_version_t) * AGA_GPU_MAX_FIRMWARE_VERSION);
+        for (uint32_t i = 0; i < fw_info.num_fw_info; i++) {
+            fill_gpu_fw_version_(&status->fw_version[i],
+                                 fw_info.fw_info_list[i].fw_id,
+                                 fw_info.fw_info_list[i].fw_version);
+        }
+        status->num_fw_versions = fw_info.num_fw_info;
+    }
+    // fill the memory vendor
+    amdsmi_ret = amdsmi_get_gpu_vram_vendor(gpu_handle, status->memory_vendor,
+                                            AGA_MAX_STR_LEN);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get memory vendor for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    }
+    // fill vram status
+    smi_fill_vram_status_(gpu_handle, &status->vram_status);
+    // fill GPU BDF
+    amdsmi_ret = amdsmi_get_gpu_bdf_id(gpu_handle, &value_64);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get PCIe bus id for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        // convert PCIe bus id to XXXX:XX:XX.X format
+        snprintf(status->pcie_status.pcie_bus_id, AGA_MAX_STR_LEN,
+                 "%04X:%02X:%02X.%X",
+                 ((uint32_t)((value_64 >> 32) & 0xffffffff)),
+                 ((uint32_t)((value_64 >> 8) & 0xff)),
+                 ((uint32_t)((value_64 >> 3) & 0x1f)),
+                 ((uint32_t)(value_64 & 0x7)));
+    }
+    // get energy counter resolution if not already set
+    if (g_energy_counter_resolution == 0.0) {
+        amdsmi_ret = amdsmi_get_energy_count(gpu_handle, &value_64,
+                         &g_energy_counter_resolution, &value_64);
+        if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+            AGA_TRACE_ERR("Failed to get energy count for GPU {}, err {}",
+                          gpu_handle, amdsmi_ret);
+            // in case of failure use the default value
+            g_energy_counter_resolution = AMDSMI_COUNTER_RESOLUTION;
+        }
+    }
+    return SDK_RET_OK;
+}
+
+static inline std::string
+timestamp_string_from_cper_timestamp (amdsmi_cper_timestamp_t *ts)
+{
+    uint32_t full_year;
+    std::ostringstream oss;
+
+    // assuming year is offset from 2000
+    full_year = 2000 + ts->year;
+
+    // cast the narrow timestamp fields to an integer type so they are
+    // streamed as numbers rather than characters
+    oss << std::setfill('0') << std::setw(4) << full_year << "-"
+        << std::setw(2) << static_cast<uint32_t>(ts->month) << "-"
+        << std::setw(2) << static_cast<uint32_t>(ts->day) << " "
+        << std::setw(2) << static_cast<uint32_t>(ts->hours) << ":"
+        << std::setw(2) << static_cast<uint32_t>(ts->minutes) << ":"
+        << std::setw(2) << static_cast<uint32_t>(ts->seconds);
+
+    return oss.str();
+}
+
+sdk_ret_t
+smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle,
+                          aga_cper_severity_t severity, aga_cper_info_t *info)
+{
+    char *cper_data;
+    char *cper_buffer;
+    uint64_t cursor = 0;
+    uint32_t severity_mask;
+    amdsmi_status_t afid_status;
+    uint64_t total_cper_entries = 0;
+    uint64_t buf_size = CPER_BUF_SIZE;
+    uint32_t prev_cper_record_size = 0;
+    uint64_t num_cper_hdr = AGA_GPU_MAX_CPER_ENTRY;
+    amdsmi_status_t status = AMDSMI_STATUS_MORE_DATA;
+    amdsmi_cper_hdr_t *cper_hdrs[AGA_GPU_MAX_CPER_ENTRY];
+
+    // set severity mask
+    switch (severity) {
+    case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED:
severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED); + break; + case AGA_CPER_SEVERITY_FATAL: + severity_mask = (1 << AMDSMI_CPER_SEV_FATAL); + break; + case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + default: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED) | + (1 << AMDSMI_CPER_SEV_FATAL) | + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + } + // allocate memory for CPER data + cper_data = (char *)malloc(buf_size); + // cper_buffer is used to keep track of each individual record + cper_buffer = cper_data; + while (status == AMDSMI_STATUS_MORE_DATA) { + // get CPER entries + status = amdsmi_get_gpu_cper_entries(gpu_handle, severity_mask, + cper_data, &buf_size, cper_hdrs, &num_cper_hdr, &cursor); + if ((status != AMDSMI_STATUS_SUCCESS) && + (status != AMDSMI_STATUS_MORE_DATA)) { + AGA_TRACE_ERR("Failed to get CPER entries for GPU {}, err {}", + gpu_handle, status); + // free allocated memory + free(cper_data); + return amdsmi_ret_to_sdk_ret(status); + } + for (uint64_t i = 0; + i < num_cper_hdr && total_cper_entries < AGA_GPU_MAX_CPER_ENTRY; + i++, total_cper_entries++) { + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + cper_entry->record_id = std::string(cper_hdrs[i]->record_id); + cper_entry->severity = + smi_to_aga_cper_severity(cper_hdrs[i]->error_severity); + cper_entry->revision = cper_hdrs[i]->revision; + if (cper_hdrs[i]->cper_valid_bits.valid_bits.timestamp) { + cper_entry->timestamp = + timestamp_string_from_cper_timestamp( + &cper_hdrs[i]->timestamp); + } + cper_entry->creator_id = std::string(cper_hdrs[i]->creator_id); + cper_entry->notification_type = + smi_to_aga_cper_notification_type(cper_hdrs[i]->notify_type); + // get AMD field ids from the cper record + cper_buffer += prev_cper_record_size; + // initialize num_af_id to be the size of the array + cper_entry->num_af_id = AGA_GPU_MAX_AF_ID_PER_CPER; + afid_status = amdsmi_get_afids_from_cper(cper_buffer, + cper_hdrs[i]->record_length, cper_entry->af_id, + &cper_entry->num_af_id); + if (afid_status != AMDSMI_STATUS_SUCCESS) { + cper_entry->num_af_id = 0; + AGA_TRACE_ERR("Failed to get AMD field id for CPER entry for " + "GPU {}, err {}", gpu_handle, status); + } + // update prev_cper_record_size + prev_cper_record_size = cper_hdrs[i]->record_length; + } + } + + // free allocated memory + free(cper_data); + return SDK_RET_OK; +} + +} // namespace aga diff --git a/sw/nic/gpuagent/api/smi/smi_api_mock.cc b/sw/nic/gpuagent/api/smi/smi_api_mock.cc index 1e86361..2b7204f 100644 --- a/sw/nic/gpuagent/api/smi/smi_api_mock.cc +++ b/sw/nic/gpuagent/api/smi/smi_api_mock.cc @@ -277,6 +277,37 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle, for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { stats->usage.gfx_busy_inst[i] = distr(gen) % 100 ; } + // fill violation stats + stats->violation_stats.current_accumulated_counter = 123456 + distr(gen) - distr(gen); + stats->violation_stats.processor_hot_residency_accumulated = 23456 + distr(gen) - distr(gen); + stats->violation_stats.ppt_residency_accumulated = 34567 + distr(gen) - distr(gen); + stats->violation_stats.socket_thermal_residency_accumulated = 45678 + distr(gen) - distr(gen); + stats->violation_stats.vr_thermal_residency_accumulated = 56789 + distr(gen) - distr(gen); + stats->violation_stats.hbm_thermal_residency_accumulated = 67890 + distr(gen) - distr(gen); + stats->violation_stats.processor_hot_residency_percentage = distr(gen) % 100; + 
stats->violation_stats.ppt_residency_percentage = distr(gen) % 100; + stats->violation_stats.socket_thermal_residency_percentage = distr(gen) % 100; + stats->violation_stats.vr_thermal_residency_percentage = distr(gen) % 100; + stats->violation_stats.hbm_thermal_residency_percentage = distr(gen) % 100; + + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + stats->violation_stats.gfx_clk_below_host_limit_power_accumulated[i] = + 1234 + distr(gen) - distr(gen); + stats->violation_stats.gfx_clk_below_host_limit_thermal_accumulated[i] = + 2345 + distr(gen) - distr(gen); + stats->violation_stats.gfx_low_utilization_accumulated[i] = + 3456 + distr(gen) - distr(gen); + stats->violation_stats.gfx_clk_below_host_limit_total_accumulated[i] = + 4567 + distr(gen) - distr(gen); + stats->violation_stats.gfx_clk_below_host_limit_power_percentage[i] = + distr(gen) % 100; + stats->violation_stats.gfx_clk_below_host_limit_thermal_percentage[i] = + distr(gen) % 100; + stats->violation_stats.gfx_low_utilization_percentage[i] = + distr(gen) % 100; + stats->violation_stats.gfx_clk_below_host_limit_total_percentage[i] = + distr(gen) % 100; + } return SDK_RET_OK; } diff --git a/sw/nic/gpuagent/api/smi/smi_api_mock.cc.orig b/sw/nic/gpuagent/api/smi/smi_api_mock.cc.orig new file mode 100644 index 0000000..1e86361 --- /dev/null +++ b/sw/nic/gpuagent/api/smi/smi_api_mock.cc.orig @@ -0,0 +1,706 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +//---------------------------------------------------------------------------- +/// +/// \file +/// smi layer mock API definitions +/// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include "nic/sdk/include/sdk/base.hpp" +#include "nic/sdk/lib/event_thread/event_thread.hpp" +#include "nic/gpuagent/core/aga_core.hpp" +#include "nic/gpuagent/core/ipc_msg.hpp" +#include "nic/gpuagent/core/trace.hpp" +#include "nic/gpuagent/api/aga_state.hpp" +#include "nic/gpuagent/api/include/aga_gpu.hpp" +#include "nic/gpuagent/api/include/aga_init.hpp" +#include "nic/gpuagent/api/smi/smi_api.hpp" +#include "nic/gpuagent/api/smi/smi_events.hpp" +#include "nic/gpuagent/api/smi/smi_api_mock_impl.hpp" + +/// initial delay (in seconds) after which event monitoring starts +#define AGA_SMI_EVENT_MONITOR_START_DELAY 10.0 +/// event monitoring frequency (in seconds) +#define AGA_SMI_EVENT_MONITOR_INTERVAL 3.0 + +namespace aga { + +/// event database indexed by processor handle +unordered_map g_gpu_event_db; +/// event monitor thread instance +sdk::event_thread::event_thread *g_event_monitor_thread; + +/// \brief fill clock frequency ranges of the given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] spec spec to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_gpu_clock_frequency_spec_ (aga_gpu_handle_t gpu_handle, + aga_gpu_spec_t *spec) +{ + // fill sClock spec + spec->clock_freq[0].clock_type = AGA_GPU_CLOCK_TYPE_SYSTEM; + spec->clock_freq[0].lo = 500; + spec->clock_freq[0].hi = 1700; + // fill mClock spec + spec->clock_freq[1].clock_type = AGA_GPU_CLOCK_TYPE_MEMORY; + spec->clock_freq[1].lo = 400; + spec->clock_freq[1].hi = 1600; + // fill video clock spec + spec->clock_freq[2].clock_type = AGA_GPU_CLOCK_TYPE_VIDEO; + spec->clock_freq[2].lo = 914; + spec->clock_freq[2].hi = 1333; + // fill data clock spec + spec->clock_freq[3].clock_type = AGA_GPU_CLOCK_TYPE_DATA; + spec->clock_freq[3].lo = 711; + spec->clock_freq[3].hi = 1143; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + aga_gpu_status_t *status) +{ + // no need to do anything for mock + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_spec (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec) +{ + spec->overdrive_level = 0; + spec->perf_level = AGA_GPU_PERF_LEVEL_AUTO; + + // fill gpu and memory clock frequencies + smi_fill_gpu_clock_frequency_spec_(gpu_handle, spec); + spec->compute_partition_type = AGA_GPU_COMPUTE_PARTITION_TYPE_SPX; + return SDK_RET_OK; +} + +/// \brief fill GPU enumeration ids info using the given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_gpu_enumeration_id_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_status_t *status) +{ + status->kfd_id = 58934; + status->node_id = 3; + status->drm_render_id = 128; + status->drm_card_id = 3; + return SDK_RET_OK; +} + +/// \brief fill list of pids using the given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_gpu_kfd_pid_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_status_t *status) +{ + // TODO: fill kfd pids when this data is available + return SDK_RET_OK; +} + +/// \brief function to format firmware version +/// 
\param[out] fw_version firmware component/version after formatting +/// \param[in] block firmware component name +/// \param[in] version firmware version +/// \return none +static void +fill_gpu_fw_version_ (aga_gpu_fw_version_t *fw_version, const char *block, + const char *version) +{ + strncpy(fw_version->firmware, block, AGA_MAX_STR_LEN); + strncpy(fw_version->version, version, AGA_MAX_STR_LEN); +} + +/// \brief fill supported and current frequencies of system clocks +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, aga_gpu_status_t *status) +{ + for (uint32_t i = 0; i < AGA_GPU_MAX_CLOCK; i++) { + auto clock_status = &status->clock_status[i]; + if (i < AGA_GPU_GFX_MAX_CLOCK) { + // gfx clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_SYSTEM; + clock_status->frequency = 138 + i; + clock_status->locked = (i % 2); + clock_status->deep_sleep = + (clock_status->frequency <= 140) ? true : false; + } else if (i < (AGA_GPU_GFX_MAX_CLOCK + AGA_GPU_MEM_MAX_CLOCK)) { + // memory clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_MEMORY; + clock_status->frequency = 900; + clock_status->locked = false; + clock_status->deep_sleep = false; + } else if (i < (AGA_GPU_GFX_MAX_CLOCK + AGA_GPU_MEM_MAX_CLOCK + + AGA_GPU_VIDEO_MAX_CLOCK)) { + // video clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_VIDEO; + clock_status->frequency = 29; + clock_status->locked = false; + clock_status->deep_sleep = true; + } else { + // data clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_DATA; + clock_status->frequency = 22; + clock_status->locked = false; + clock_status->deep_sleep = true; + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_status (aga_gpu_handle_t gpu_handle, uint32_t gpu_id, + aga_gpu_spec_t *spec, aga_gpu_status_t *status) +{ + status->index = gpu_id; + status->handle = gpu_handle; + // fill the GPU serial number + strncpy(status->serial_num, "PCB046982-0071", AGA_MAX_STR_LEN); + // fill the GPU card series + strncpy(status->card_series, "AMD INSTINCT MI300 (MCM) OAM AC MBA MSFT", + AGA_MAX_STR_LEN); + // fill the GPU card model + strncpy(status->card_model, "102-G30211-00", AGA_MAX_STR_LEN); + // fill the GPU vendor information + strncpy(status->card_vendor, "Advanced Micro Devices, Inc. 
[AMD/ATI]", + AGA_MAX_STR_LEN); + // fill the driver version + strncpy(status->driver_version, "7.0.0", AGA_MAX_STR_LEN); + // fill the vbios part number + strncpy(status->vbios_part_number, "113-D65205-107", AGA_MAX_STR_LEN); + // fill the vbios version + strncpy(status->vbios_version, "022.040.003.041.000001", AGA_MAX_STR_LEN); + // fill sku + strncpy(status->card_sku, "D65205", AGA_MAX_STR_LEN); + // fill the firmware version + fill_gpu_fw_version_(&status->fw_version[1], "MEC2", "78"); + fill_gpu_fw_version_(&status->fw_version[2], "RLC", "17"); + fill_gpu_fw_version_(&status->fw_version[4], "SDMA2", "8"); + fill_gpu_fw_version_(&status->fw_version[7], "TA_RAS", "27.00.01.60"); + fill_gpu_fw_version_(&status->fw_version[8], "TA_XGMI", "32.00.00.19"); + fill_gpu_fw_version_(&status->fw_version[9], "VCN", "0x0110101b"); + // fill the memory vendor + strncpy(status->memory_vendor, "hynix", AGA_MAX_STR_LEN); + smi_fill_clock_status_(gpu_handle, status); + // fill the PCIe bus id + strncpy(status->pcie_status.pcie_bus_id, "0000:59:00.0", AGA_MAX_STR_LEN); + status->pcie_status.slot_type = AGA_PCIE_SLOT_TYPE_OAM; + status->pcie_status.width = 16; + status->pcie_status.max_width = 16; + status->pcie_status.speed = 16; + status->pcie_status.max_speed = 32; + status->pcie_status.bandwidth = 315; + // fill VRAM status + status->vram_status.type = AGA_VRAM_TYPE_HBM; + strcpy(status->vram_status.vendor, "hynix"); + status->vram_status.size = 196592; + // fill the xgmi error count + status->xgmi_status.error_status = AGA_GPU_XGMI_STATUS_NO_ERROR; + // fill total memory + // fill kfd pid info + smi_fill_gpu_kfd_pid_status_(gpu_handle, status); + status->partition_id = 0; + smi_fill_gpu_enumeration_id_status_(gpu_handle, status); + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle, + bool partition_capable, + uint32_t partition_id, + aga_gpu_handle_t first_partition_handle, + aga_gpu_stats_t *stats) +{ + std::random_device rd; // obtain a random number from hardware + std::mt19937 gen(rd()); // seed the generator + std::uniform_int_distribution<> distr(0, 90); + + // fill the avg package power + stats->avg_package_power = 90 + distr(gen) - distr(gen); + // fill the current package power + stats->package_power = 90 + distr(gen) - distr(gen); + // fill the GPU usage + stats->usage.gfx_activity = distr(gen) % 100; + // fill VRAM usage + stats->vram_usage.total_vram = 196592; + stats->vram_usage.used_vram = 1273; + stats->vram_usage.free_vram = + stats->vram_usage.total_vram - stats->vram_usage.used_vram; + stats->vram_usage.total_visible_vram = 196592; + stats->vram_usage.used_visible_vram = 1273; + stats->vram_usage.free_visible_vram = + stats->vram_usage.total_visible_vram - + stats->vram_usage.used_visible_vram; + stats->vram_usage.total_gtt = 128716; + stats->vram_usage.used_gtt = 20; + stats->vram_usage.free_gtt = + stats->vram_usage.total_gtt - stats->vram_usage.used_gtt; + // fill the PCIe stats + ++stats->pcie_stats.replay_count; + ++stats->pcie_stats.tx_bytes; + ++stats->pcie_stats.recovery_count; + ++stats->pcie_stats.replay_rollover_count; + ++stats->pcie_stats.nack_sent_count; + ++stats->pcie_stats.nack_received_count; + ++stats->pcie_stats.rx_bytes; + ++stats->pcie_stats.tx_bytes; + ++stats->pcie_stats.bidir_bandwidth; + // fill the energy consumed + stats->energy_consumed = 25293978861568 + distr(gen) - distr(gen); + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + stats->usage.gfx_busy_inst[i] = distr(gen) % 100 ; + } + return SDK_RET_OK; +} + 
+typedef struct gpu_event_cb_ctxt_s { + aga_event_read_cb_t cb; + void *ctxt; +} gpu_event_cb_ctxt_t; + +// generate one event for each GPU +static inline bool +gpu_event_read_cb (void *obj, void *ctxt) +{ + timespec_t ts; + aga_event_t event = {}; + aga_event_id_t event_id; + void *event_buffer = event_get(); + gpu_entry *gpu = (gpu_entry *)obj; + gpu_event_cb_ctxt_t *walk_ctxt = (gpu_event_cb_ctxt_t *)ctxt; + + event_id = event_buffer_get_event_id(event_buffer, 0); + + // get current time + clock_gettime(CLOCK_REALTIME, &ts); + // fill the event information + event.id = event_id; + event.timestamp = ts; + event.gpu = gpu->key(); + strncpy(event.message, event_buffer_get_message(event_buffer, 0), + AGA_MAX_EVENT_STR); + event.message[AGA_MAX_EVENT_STR] = '\0'; + // call the callback now + walk_ctxt->cb(&event, walk_ctxt->ctxt); + return false; +} + +sdk_ret_t +event_read (aga_event_read_cb_t cb, void *ctxt) +{ + gpu_event_cb_ctxt_t event_ctxt; + + event_ctxt.cb = cb; + event_ctxt.ctxt = ctxt; + gpu_db()->walk(gpu_event_read_cb, &event_ctxt); + return SDK_RET_OK; +} + +sdk_ret_t +smi_event_read_all (aga_event_read_cb_t cb, void *ctxt) +{ + return event_read(cb, ctxt); +} + +sdk_ret_t +event_monitor_init (void) +{ + gpu_event_record_t null_event_record = {}; + + // initialize the s/w state + for (uint32_t d = 0; d < AGA_MOCK_NUM_GPU; d++) { + SDK_SPINLOCK_INIT(&g_gpu_event_db[gpu_get_handle(d)].slock, + PTHREAD_PROCESS_SHARED); + } + return SDK_RET_OK; +} + +sdk_ret_t +cleanup_event_listeners (vector& listeners) +{ + aga_event_listener_info_t listener; + + for (auto it = listeners.begin(); it != listeners.end(); it++) { + listener = *it; + + // if client context of one gpu is inactive, + // we should erase the client context from all gpus + // and all events related to this gRPC stream before + // waking up the front end, otherwise the client contexts + // stored for other gpus for the same subscribe request + // will eventually lead to agent crash + + for (uint32_t d = 0; d < AGA_MOCK_NUM_GPU; d++) { + // lock the event state for this device + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + for (uint32_t e = (AGA_EVENT_ID_NONE + 1); e <= AGA_EVENT_ID_MAX; + e++) { + auto& event_record = + g_gpu_event_db[gpu_get_handle(d)].event_map[(aga_event_id_t)e]; + // erase the client + event_record.client_info.client_set.erase(listener.client_ctxt); + } + // unlock the event state for this device + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + } + // wakeup the front end thread so it can exit + listener.client_ctxt->client_inactive = true; + AGA_TRACE_INFO("Signaling frontend gRPC thread to quit, client {}, " + "client ctxt {}, stream {}", + listener.client_ctxt->client.c_str(), + (void *)listener.client_ctxt, + listener.client_ctxt->stream); + pthread_cond_signal(&listener.client_ctxt->cond); + } + return SDK_RET_OK; +} + +static sdk_ret_t +handle_events (uint32_t num_events, void *event_buffer) +{ + sdk_ret_t ret; + timespec_t ts; + gpu_entry *gpu; + aga_gpu_handle_t gpu_handle; + aga_event_t event = {}; + aga_event_id_t event_id; + aga_event_client_ctxt_t *client_ctxt; + aga_event_listener_info_t inactive_listener; + vector inactive_listeners; + + // get current time + clock_gettime(CLOCK_REALTIME, &ts); + // start processing all the events + for (uint32_t i = 0; i < num_events; i++) { + gpu_handle = event_buffer_get_gpu_handle(event_buffer, i); + gpu = gpu_db()->find(gpu_handle); + if (gpu == NULL) { + continue; + } + event_id = 
event_buffer_get_event_id(event_buffer, i); + auto& event_map = g_gpu_event_db[gpu_handle].event_map; + + // lock the event state for this device + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_handle].slock); + // update our event state + auto& event_record = event_map[event_id]; + event_record.timestamp = ts; + strncpy(event_record.message, event_buffer_get_message(event_buffer, i), + AGA_MAX_EVENT_STR); + event_record.message[AGA_MAX_EVENT_STR] = '\0'; + // fill the event record + event.id = event_id; + event.timestamp = ts; + event.gpu = gpu->key(); + strncpy(event.message, event_buffer_get_message(event_buffer, i), + AGA_MAX_EVENT_STR); + event.message[AGA_MAX_EVENT_STR] = '\0'; + // walk thru all the clients that are interested in this event and + // notify them + for (auto client_set_it = event_record.client_info.client_set.begin(); + client_set_it != event_record.client_info.client_set.end(); + client_set_it++) { + client_ctxt = *client_set_it; + // invoke the event notification callback + ret = client_ctxt->notify_cb(&event, *client_set_it); + if (unlikely(ret != SDK_RET_OK)) { + // add to list of clients not reachable + inactive_listener.gpu_id = gpu->id(); + inactive_listener.event = event_id; + inactive_listener.client_ctxt = *client_set_it; + inactive_listeners.push_back(inactive_listener); + } + } + // unlock the event state maintained for this device + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_handle].slock); + } + // handle all the dead clients now + cleanup_event_listeners(inactive_listeners); + return SDK_RET_OK; +} + +static void +event_monitor_timer_cb (sdk::event_thread::timer_t *timer) +{ + // handle all the events + handle_events(1, event_get()); +} + +/// \brief process an event subscribe request from client +/// \param[in] req pointer to incoming request +/// \return SDK_RET_OK if success or error code in case of failure +sdk_ret_t +process_event_subscribe_req (aga_event_subscribe_args_t *req) +{ + gpu_event_record_t event_record = {}; + + for (size_t i = 0; i < req->events.size(); i++) { + AGA_TRACE_DEBUG("Rcvd event {} subscribe request, client {}, " + "client ctxt {}, stream {}", req->events[i], + req->client_ctxt->client.c_str(), + (void *)req->client_ctxt, + (void *)req->client_ctxt->stream); + for (size_t g = 0; g < req->gpu_ids.size(); g++) { + uint32_t d = req->gpu_ids[g]; + auto& event_map = g_gpu_event_db[gpu_get_handle(d)].event_map; + + // lock the event map for this device + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + // check if this event was of interest to any client or happened + // already + auto event_map_it = event_map.find(req->events[i]); + if (event_map_it == event_map.end()) { + // 1st time anyone is subscribing to this event + event_record.client_info.client_set.insert(req->client_ctxt); + event_map[req->events[i]] = event_record; + } else { + // atleast one client is already interested in this event, check + // if this particular client already subscribed to this event + auto set_it = event_map_it->second.client_info.client_set.find( + req->client_ctxt); + if (set_it == + event_map_it->second.client_info.client_set.end()) { + // this client is a new listener for this event + event_map_it->second.client_info.client_set.insert( + req->client_ctxt); + } else { + // this client is already subscribed to this event + } + } + // unlock the event map for this device + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + } + } + return SDK_RET_OK; +} + +/// \brief callback function to process IPC msg from gRPC thread +/// to 
handle event subscription requests +/// \param[in] msg received IPC message +/// \param[in] ctxt opaque context (used when callback was registered) +static void +event_subscribe_ipc_cb (sdk::ipc::ipc_msg_ptr msg, const void *ctxt) +{ + sdk_ret_t ret; + aga_event_subscribe_args_t *req; + + req = *(aga_event_subscribe_args_t **)msg->data(); + if (req == NULL) { + AGA_TRACE_ERR("Ignoring NULL event subscribe request received"); + return; + } + ret = process_event_subscribe_req(req); + sdk::ipc::respond(msg, &ret, sizeof(ret)); +} + +static void +event_monitor_thread_init (void *ctxt) +{ + static sdk::event_thread::timer_t event_monitor_timer; + + // initialize event monitoring state + event_monitor_init(); + // subscribe to all IPC msgs of interest + sdk::ipc::reg_request_handler(AGA_IPC_MSG_ID_EVENT_SUBSCRIBE, + event_subscribe_ipc_cb, NULL); + // start event monitoring timer + sdk::event_thread::timer_init(&event_monitor_timer, event_monitor_timer_cb, + AGA_SMI_EVENT_MONITOR_START_DELAY, + AGA_SMI_EVENT_MONITOR_INTERVAL); + sdk::event_thread::timer_start(&event_monitor_timer); +} + +static void +event_monitor_thread_exit (void *ctxt) +{ + // cleanup the event state + for (uint32_t d = 0; d < AGA_MOCK_NUM_GPU; d++) { + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + g_gpu_event_db[gpu_get_handle(d)].event_map.clear(); + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + } +} + +sdk_ret_t +spawn_event_monitor_thread (void) +{ + g_event_monitor_thread = + sdk::event_thread::event_thread::factory( + "event-monitor", AGA_THREAD_ID_EVENT_MONITOR, + sdk::lib::THREAD_ROLE_CONTROL, 0x0, event_monitor_thread_init, + event_monitor_thread_exit, NULL, // message + sdk::lib::thread::priority_by_role(sdk::lib::THREAD_ROLE_CONTROL), + sdk::lib::thread::sched_policy_by_role(sdk::lib::THREAD_ROLE_CONTROL), + (THREAD_YIELD_ENABLE | THREAD_SYNC_IPC_ENABLE)); + SDK_ASSERT_TRACE_RETURN((g_event_monitor_thread != NULL), SDK_RET_ERR, + "GPU event monitor thread create failure"); + g_event_monitor_thread->start(NULL); + return SDK_RET_OK; +} + +sdk_ret_t +smi_init (aga_api_init_params_t *init_params) +{ + // spawn event monitor thread + spawn_event_monitor_thread(); + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_reset (aga_gpu_handle_t gpu_handle, aga_gpu_reset_type_t reset_type) +{ + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_update (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + uint64_t upd_mask) +{ + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_device_topology (aga_gpu_handle_t gpu_handle, + aga_device_topology_info_t *info) +{ + uint32_t gpu_id; + uint32_t cnt = 0; + static std::string name = "GPU"; + + // get linear GPU index from device name + sscanf(info->device.name, "GPU%u", &gpu_id); + for (uint32_t i = 0; i < AGA_MOCK_NUM_GPU; i++) { + if (gpu_handle != gpu_get_handle(i)) { + info->peer_device[cnt].peer_device.type = AGA_DEVICE_TYPE_GPU; + strcpy(info->peer_device[cnt].peer_device.name, + (name + std::to_string(i)).c_str()); + info->peer_device[cnt].num_hops = 1; + info->peer_device[cnt].connection.type = AGA_IO_LINK_TYPE_XGMI; + info->peer_device[cnt].link_weight = 15 + (15 * ((i + gpu_id) % 5)); + info->peer_device[cnt].valid = true; + cnt++; + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_partition_id (aga_gpu_handle_t gpu_handle, uint32_t *partition_id) +{ + *partition_id = 0; + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_virtualization_mode (aga_gpu_handle_t gpu_handle, + aga_gpu_virtualization_mode_t *mode) +{ + *mode = 
AGA_VIRTUALIZATION_MODE_BAREMETAL; + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_partition_info (aga_gpu_handle_t gpu_handle, bool *capable, + aga_gpu_compute_partition_type_t *compute_partition, + aga_gpu_memory_partition_type_t *memory_partition) +{ + *capable = true; + *compute_partition = AGA_GPU_COMPUTE_PARTITION_TYPE_SPX; + *memory_partition = AGA_GPU_MEMORY_PARTITION_TYPE_NPS1; + return SDK_RET_OK; +} + +sdk_ret_t +smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles, + aga_obj_key_t *gpu_keys) +{ + if (!num_gpus) { + return SDK_RET_ERR; + } + *num_gpus = AGA_MOCK_NUM_GPU; + for (uint32_t i = 0; i < *num_gpus; i++) { + gpu_handles[i] = gpu_get_handle(i); + } + if (gpu_keys) { + for (uint32_t i = 0; i < *num_gpus; i++) { + gpu_keys[i] = gpu_uuid(i, gpu_get_unique_id(i)); + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_get_bad_page_count (void *gpu_obj, + uint32_t *num_bad_pages) +{ + *num_bad_pages = 1; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_get_bad_page_records (void *gpu_obj, + uint32_t num_bad_pages, + aga_gpu_bad_page_record_t *records) +{ + gpu_entry *gpu = (gpu_entry *)gpu_obj; + + records[0].key = gpu->key(); + records[0].page_address = 0x5c70ec; + records[0].page_size = 4096; + records[0].page_status = AGA_GPU_PAGE_STATUS_UNRESERVABLE; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle, + aga_cper_severity_t severity, aga_cper_info_t *info) +{ + uint64_t gpu_key; + std::ostringstream oss; + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + gpu_key = (uint64_t)gpu_handle; + oss << (gpu_key % 8) + 1 << ":" << (gpu_key+ 5) % 8 + 1; + cper_entry->record_id = oss.str(); + cper_entry->severity = AGA_CPER_SEVERITY_FATAL; + cper_entry->revision = 256; + + oss.str(""); + oss << std::setfill('0') << "2025-09-" << std::setw(2) << + (gpu_key % 31) + 1 << " 15:00:" << std::setw(2) << (gpu_key % 60) + 1; + cper_entry->timestamp = oss.str(); + cper_entry->notification_type = AGA_CPER_NOTIFICATION_TYPE_MCE; + cper_entry->creator_id = "amdgpu"; + cper_entry->num_af_id = 1; + cper_entry->af_id[0] = 30; + return SDK_RET_OK; +} + +} // namespace aga diff --git a/sw/nic/gpuagent/cli/cmd/gpu.go b/sw/nic/gpuagent/cli/cmd/gpu.go index a3f65e0..9644c83 100644 --- a/sw/nic/gpuagent/cli/cmd/gpu.go +++ b/sw/nic/gpuagent/cli/cmd/gpu.go @@ -1098,6 +1098,20 @@ func printUsageHdr(indent string) { } } +func printViolationAccumulatedHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "GPU GFX clock host limit accumulated:\n") + printHdr = true + } +} + +func printViolationPercentageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "GPU GFX clock host limit percentage:\n") + printHdr = true + } +} + func printGPUStats(gpu *aga.GPU, statsOnly bool) { var indent string spec := gpu.GetSpec() @@ -1650,6 +1664,7 @@ func printGPUStats(gpu *aga.GPU, statsOnly bool) { } } if stats.GetViolationStats() != nil { + printHdr = false vStats := stats.GetViolationStats() if vStats.GetCurrentAccumulatedCounter() != UINT64_MAX_VAL { fmt.Printf(indent+"%-38s : %d\n", "Current accumulated counter", @@ -1679,6 +1694,168 @@ func printGPUStats(gpu *aga.GPU, statsOnly bool) { "HBM thermal residency accumulated", vStats.GetHBMThermalResidencyAccumulated()) } + if vStats.GetProcessorHotResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "Processor hot residency percentage", + vStats.GetProcessorHotResidencyPercentage()) + } + if vStats.GetPPTResidencyPercentage() != UINT64_MAX_VAL { + 
fmt.Printf(indent+"%-38s : %d%%\n", + "PPT residency percentage", + vStats.GetPPTResidencyPercentage()) + } + if vStats.GetSocketThermalResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "Socket thermal residency percentage", + vStats.GetSocketThermalResidencyPercentage()) + } + if vStats.GetVRThermalResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "VR thermal residency percentage", + vStats.GetVRThermalResidencyPercentage()) + } + if vStats.GetHBMThermalResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "HBM thermal residency percentage", + vStats.GetHBMThermalResidencyPercentage()) + } + validEntry := false + gStr := fmt.Sprintf(" %-36s : ", "Power") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitPowerAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Thermal") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTHMAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Low Utilization") + for _, gfx := range stats.GetViolationStats().GetGFXLowUtilizationAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Total") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTotalAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Power") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitPowerPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + printHdr = false + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Thermal") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTHMPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", 
gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Low Utilization") + for _, gfx := range stats.GetViolationStats().GetGFXLowUtilizationPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Total") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTotalPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } } fmt.Printf("\n%s\n", strings.Repeat("-", 80)) diff --git a/sw/nic/gpuagent/cli/cmd/gpu.go.orig b/sw/nic/gpuagent/cli/cmd/gpu.go.orig new file mode 100644 index 0000000..a3f65e0 --- /dev/null +++ b/sw/nic/gpuagent/cli/cmd/gpu.go.orig @@ -0,0 +1,2034 @@ +// +// Copyright(C) Advanced Micro Devices, Inc. All rights reserved. +// +// You may not use this software and documentation (if any) (collectively, +// the "Materials") except in compliance with the terms and conditions of +// the Software License Agreement included with the Materials or otherwise as +// set forth in writing and signed by you and an authorized signatory of AMD. +// If you do not have a copy of the Software License Agreement, contact your +// AMD representative for a copy. +// +// You agree that you will not reverse engineer or decompile the Materials, +// in whole or in part, except as allowed by applicable law. +// +// THE MATERIALS ARE DISTRIBUTED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR +// REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. 
+// + +//------------------------------------------------------------------------------ +/// +/// \file +/// gpctl command line interface for gpu protobufs +/// +//------------------------------------------------------------------------------ + +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "strconv" + "strings" + + uuid "github.com/satori/go.uuid" + "github.com/spf13/cobra" + yaml "gopkg.in/yaml.v2" + + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" +) + +var ( + gpuID string + gpuAdminState string + overDriveLevel uint32 + powerCap uint64 + perfLevel string + gpuClkFreq string + memClkFreq string + fanSpeed uint64 + gpuAdminStateVal aga.GPUAdminState + PerformanceLevelVal aga.GPUPerformanceLevel + clockType aga.GPUClockType + memPartition string + memPartitionVal aga.GPUMemoryPartitionType + computePartition string + computePartitionVal aga.GPUComputePartitionType + gpuClkType string + gpuClkFreqLo uint32 + gpuClkFreqHi uint32 + memClkFreqLo uint32 + memClkFreqHi uint32 + printHdr bool + severity string +) + +const ( + UINT16_MAX_VAL_UINT16 uint16 = 0xffff + UINT16_MAX_VAL_UINT32 uint32 = 0xffff + UINT16_MAX_VAL_UINT64 uint64 = 0xffff + UINT32_MAX_VAL_UINT32 uint32 = 0xffffffff + UINT32_MAX_VAL_UINT64 uint64 = 0xffffffff + UINT64_MAX_VAL uint64 = 0xffffffffffffffff + FLOAT32_INVALID_VAL float32 = 65535.0 +) + +var gpuShowCmd = &cobra.Command{ + Use: "gpu", + Short: "show GPU information", + Long: "show GPU information", + RunE: gpuShowCmdHandler, +} + +var gpuAllShowCmd = &cobra.Command{ + Use: "all", + Short: "show all GPU object", + Long: "show all GPU object", + RunE: gpuAllShowCmdHandler, +} + +var gpuPartitionsShowCmd = &cobra.Command{ + Use: "compute-partition", + Short: "show physical GPU's compute partitions", + Long: "show physical GPU's compute partitions", + RunE: gpuPartitionsShowCmdHandler, +} + +var gpuBadPageShowCmd = &cobra.Command{ + Use: "bad-page", + Short: "show GPU bad page information", + Long: "show GPU bad page information", + RunE: gpuBadPageShowCmdHandler, +} + +var gpuCPERShowCmd = &cobra.Command{ + Use: "cper-records", + Short: "show GPU CPER records", + Long: "show GPU CPER information", + RunE: gpuCPERShowCmdHandler, +} + +var gpuStatsShowCmd = &cobra.Command{ + Use: "statistics", + Short: "show GPU statistics", + Long: "show GPU statistics", + RunE: gpuStatsShowCmdHandler, +} + +var gpuUpdateCmd = &cobra.Command{ + Use: "gpu", + Short: "update gpu object", + Long: "update gpu object", + PreRunE: gpuUpdateCmdPreRunE, + RunE: gpuUpdateCmdHandler, +} + +var gpuResetCmd = &cobra.Command{ + Use: "reset", + Short: "reset gpu object/settings", + Long: "reset gpu object/settings", + PreRunE: gpuResetCmdPreRunE, + RunE: gpuResetCmdHandler, +} + +func init() { + ShowCmd.AddCommand(gpuShowCmd) + gpuShowCmd.Flags().BoolP("yaml", "y", false, "Output in yaml") + gpuShowCmd.Flags().BoolP("json", "j", false, "Output in json") + gpuShowCmd.Flags().BoolP("status", "s", false, "Show GPU status") + gpuShowCmd.Flags().Bool("summary", false, "Display number of objects") + gpuShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuShowCmd.Flags().BoolP("partitioned", "p", false, + "Show only partitioned GPUs") + + gpuShowCmd.AddCommand(gpuAllShowCmd) + gpuAllShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + + gpuShowCmd.AddCommand(gpuStatsShowCmd) + gpuStatsShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + + 
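+
+    // The remaining registrations follow the same pattern as those above:
+    // each show subcommand is attached to gpuShowCmd with an optional "--id"
+    // flag to scope the query to a single GPU, plus "--yaml"/"--json" output
+    // flags where structured output is supported; gpuUpdateCmd is attached to
+    // DebugUpdateCmd and gpuResetCmd to gpuUpdateCmd, each carrying its
+    // tuning flags with "--id" marked as required.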
gpuShowCmd.AddCommand(gpuPartitionsShowCmd) + gpuPartitionsShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify partitioned GPU's id") + gpuPartitionsShowCmd.Flags().BoolP("yaml", "y", false, "Output in yaml") + gpuPartitionsShowCmd.Flags().BoolP("json", "j", false, "Output in json") + + gpuShowCmd.AddCommand(gpuBadPageShowCmd) + gpuBadPageShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", + "Specify GPU id") + + gpuShowCmd.AddCommand(gpuCPERShowCmd) + gpuCPERShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", + "Specify GPU id") + gpuCPERShowCmd.Flags().StringVarP(&severity, "severity", "s", "all", + "Specify CPER severity (\"fatal\", \"non-fatal-uncorrected\", "+ + "\"non-fatal-corrected\" or \"all\")") + gpuCPERShowCmd.Flags().BoolP("json", "j", false, "Output in json") + + DebugUpdateCmd.AddCommand(gpuUpdateCmd) + gpuUpdateCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuUpdateCmd.Flags().StringVarP(&gpuAdminState, "admin-state", "a", "", + "Specify admin state (up/down)") + gpuUpdateCmd.Flags().Uint32VarP(&overDriveLevel, "overdrive-level", "o", 0, + "Specify GPU clock overdrive level in percentage") + gpuUpdateCmd.Flags().Uint64VarP(&powerCap, "power-cap", "p", 0, + "Specify max package power GPU can consume (in Watts)") + gpuUpdateCmd.Flags().StringVarP(&perfLevel, "perf-level", "l", "", + "Specify GPU performance level (none/auto/low/high/deterministic/"+ + "memclock/sysclock/manual)") + gpuUpdateCmd.Flags().StringVarP(&gpuClkType, "clock-type", "t", "", + "Specify GPU clock type (memory, system, video or data)") + gpuUpdateCmd.Flags().StringVarP(&gpuClkFreq, "clock-frequency", "c", "", + "Specify GPU clock frequency range (lo-hi)") + gpuUpdateCmd.Flags().StringVarP(&memPartition, "memory-partition", "m", "", + "Specify GPU memory partition type (NPS1, NPS2, NPS4, NPS8)") + gpuUpdateCmd.Flags().StringVarP(&computePartition, "compute-partition", "", + "", "Specify GPU compute partition type (SPX, DPX, TPX, QPX, CPX)") + gpuUpdateCmd.Flags().Uint64VarP(&fanSpeed, "fan-speed", "s", 0, + "Specify fan speed") + gpuUpdateCmd.MarkFlagRequired("id") + // TODO: RAS spec + + gpuUpdateCmd.AddCommand(gpuResetCmd) + gpuResetCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuResetCmd.Flags().Bool("clocks", false, + "Reset clocks and overdrive to default") + gpuResetCmd.Flags().Bool("fans", false, "Reset fans to automatic control") + gpuResetCmd.Flags().Bool("power-profile", false, + "Reset power profile to default") + gpuResetCmd.Flags().Bool("power-overdrive", false, + "Set the maximum GPU power back to the device deafult state") + gpuResetCmd.Flags().Bool("xgmi-error", false, + "Reset XGMI error status/count") + gpuResetCmd.Flags().Bool("perf-determinism", false, + "Disable performance determinism") + gpuResetCmd.Flags().Bool("compute-partition", false, + "Resets to boot compute partition state") + gpuResetCmd.Flags().Bool("nps-mode", false, "Reset to boot NPS mode state") + gpuResetCmd.MarkFlagRequired("id") +} + +func printGPUPartitions(resp *aga.GPUComputePartition) { + fmt.Printf("%-40s%-16s", utils.IdToStr(resp.GetId()), + strings.Replace(resp.GetPartitionType().String(), + "GPU_COMPUTE_PARTITION_TYPE_", "", -1)) + + for i, partition := range resp.GPUPartition { + if i != 0 { + fmt.Printf("%-56s%-40s\n", "", utils.IdToStr(partition)) + } else { + fmt.Printf("%-40s\n", "", utils.IdToStr(partition)) + } + } +} + +type ShadowGPUComputePartition struct { + Id string + PartitionType aga.GPUComputePartitionType + GPUPartition []string +} + +func 
NewGPUComputePartition(resp *aga.GPUComputePartition) *ShadowGPUComputePartition { + var gpuPartitions []string + for _, child := range resp.GetGPUPartition() { + gpuPartitions = append(gpuPartitions, utils.IdToStr(child)) + } + return &ShadowGPUComputePartition{ + Id: utils.IdToStr(resp.GetId()), + PartitionType: resp.GetPartitionType(), + GPUPartition: gpuPartitions, + } +} + +func printGPUPartitionsJson(resp *aga.GPUComputePartition) { + partition := NewGPUComputePartition(resp) + b, _ := json.MarshalIndent(partition, " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuPartitionsShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUComputePartitionGetResponse{} + var req *aga.GPUComputePartitionGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUComputePartitionGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUComputePartitionGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUComputePartitionGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print GPUs + if cmd != nil && cmd.Flags().Changed("yaml") { + yamlArr, _ := yaml.Marshal(respMsg.Response) + fmt.Println(string(yamlArr)) + } else if cmd != nil && cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, resp := range respMsg.Response { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUPartitionsJson(resp) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else { + hdrLine := strings.Repeat("-", 96) + fmt.Println(hdrLine) + fmt.Printf("%-40s%-16s%-40s\n", "PhysicalGPU", "PartitionType", + "GPUPartitions") + fmt.Println(hdrLine) + for _, resp := range respMsg.Response { + printGPUPartitions(resp) + } + } + return nil +} + +type ShadowGPUCPEREntry struct { + GPU string + CPEREntry []*aga.CPEREntry +} + +func NewCPER(cper *aga.GPUCPEREntry) *ShadowGPUCPEREntry { + return &ShadowGPUCPEREntry{ + GPU: utils.IdToStr(cper.GetGPU()), + CPEREntry: cper.GetCPEREntry(), + } +} + +func printGPUCPEREntryJson(cper *aga.GPUCPEREntry) { + b, _ := json.MarshalIndent(NewCPER(cper), " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuCPERShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUCPERGetResponse{} + var req *aga.GPUCPERGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = 
&aga.GPUCPERGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUCPERGetRequest{ + Id: [][]byte{}, + } + } + switch strings.ToLower(severity) { + case "all": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NONE + case "non-fatal-uncorrected": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NON_FATAL_UNCORRECTED + case "fatal": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_FATAL + case "non-fatal-corrected": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NON_FATAL_CORRECTED + default: + return fmt.Errorf("Invalid value specified for \"--severity\"") + } + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUCPERGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU CPER failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print CPER information + if cmd != nil && cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, cper := range respMsg.CPER { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUCPEREntryJson(cper) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else { + hdrLine := strings.Repeat("-", 156) + fmt.Println(hdrLine) + fmt.Printf("%-20s%-40s%-16s%-25s%-10s%-10s%-15s%-20s\n", + "Timestamp", "GPU", "RecordId", "Severity", "Revision", "CreatorId", + "NtfnType", "AMDFieldId") + fmt.Println(hdrLine) + for _, cper := range respMsg.CPER { + gpuStr := utils.IdToStr(cper.GetGPU()) + for _, entry := range cper.GetCPEREntry() { + severityStr := strings.Replace(entry.GetSeverity().String(), + "CPER_SEVERITY_", "", -1) + ntfnTypeStr := + strings.Replace(entry.GetNotificationType().String(), + "CPER_NOTIFICATION_TYPE_", "", -1) + ntfnTypeStr = strings.Replace(ntfnTypeStr, "_", "-", -1) + + var afIdBuilder strings.Builder + indent := strings.Repeat(" ", 121) + for i, afId := range entry.GetAFId() { + afIdBuilder.WriteString(strconv.FormatUint(afId, 10)) + if (i+1)%3 == 0 { + if i != len(entry.GetAFId())-1 { + afIdBuilder.WriteString("\n" + indent) + } + } else if i != len(entry.GetAFId())-1 { + afIdBuilder.WriteString(", ") + } + } + afIdStr := afIdBuilder.String() + fmt.Printf("%-20s%-40s%-16s%-25s%-10d%-10s%-15s%-20s\n", + entry.GetTimestamp(), gpuStr, entry.GetRecordId(), + severityStr, entry.GetRevision(), entry.GetCreatorId(), + ntfnTypeStr, afIdStr) + } + } + } + return nil +} +func gpuShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, 
cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + var response []*aga.GPU + for _, resp := range respMsg.Response { + status := resp.GetStatus() + if len(status.GetGPUPartition()) > 0 { + if cmd != nil && cmd.Flags().Changed("partitioned") { + response = append(response, resp) + } + } else { + if cmd == nil || !cmd.Flags().Changed("partitioned") { + response = append(response, resp) + } + } + } + + // print GPUs + if cmd != nil && cmd.Flags().Changed("yaml") { + yamlArr, _ := yaml.Marshal(response) + fmt.Println(string(yamlArr)) + } else if cmd != nil && cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, resp := range response { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUJson(resp) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else if cmd != nil && cmd.Flags().Changed("summary") { + printGPUSummary(len(response)) + } else if cmd != nil && cmd.Flags().Changed("status") { + for _, resp := range response { + printGPUStatus(resp, true) + } + printGPUSummary(len(response)) + } else { + for _, resp := range response { + printGPUSpec(resp, true) + } + printGPUSummary(len(response)) + } + return nil +} + +func printGPUBadPageHeader() { + hdrLine := strings.Repeat("-", 80) + fmt.Println(hdrLine) + fmt.Printf("%-40s%-16s%-12s%-12s\n", + "GPU", "PageAddress", "PageSize", "Status") + fmt.Println(hdrLine) +} + +func gpuBadPageShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + var req *aga.GPUBadPageGetRequest + var rsp *aga.GPUBadPageGetResponse + if cmd != nil && cmd.Flags().Changed("id") { + // get for specific GPU + req = &aga.GPUBadPageGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get for all GPUs + req = &aga.GPUBadPageGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent running?") + } + defer c.Close() + defer cancel() + + client := aga.NewDebugGPUSvcClient(c) + stream, err := client.GPUBadPageGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting mapping failed, err %v", err) + } + firstResp := true + currGPU := "" + for { + rsp, err = stream.Recv() + if err == io.EOF { + break + } + if err != nil { + return fmt.Errorf("GPU bad page get failure, err %v\n", err) + } + if rsp.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", rsp.ApiStatus) + } + if firstResp == true { + printGPUBadPageHeader() + firstResp = false + } + // print GPU bad pages + for _, record := range rsp.Record { + if currGPU == 
utils.IdToStr(record.GetGPU()) { + fmt.Printf("%-40s%-16x%-12d%-12s\n", "", + record.GetPageAddress(), record.GetPageSize(), + strings.ToLower(strings.Replace( + record.GetPageStatus().String(), + "GPU_PAGE_STATUS_", "", -1))) + } else { + currGPU = utils.IdToStr(record.GetGPU()) + fmt.Printf("%-40s%-16x%-12d%-12s\n", currGPU, + record.GetPageAddress(), record.GetPageSize(), + strings.ToLower(strings.Replace( + record.GetPageStatus().String(), + "GPU_PAGE_STATUS_", "", -1))) + } + } + } + return nil +} + +func gpuStatsShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print GPUs + for _, resp := range respMsg.Response { + printGPUStats(resp, true) + } + printGPUSummary(len(respMsg.Response)) + return nil +} + +func gpuAllShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print GPUs + for _, resp := range respMsg.Response { + printGPUSpec(resp, false) + printGPUStatus(resp, false) + printGPUStats(resp, false) + } + printGPUSummary(len(respMsg.Response)) + return nil +} + +func printGPUSummary(count int) { + fmt.Printf("\nNo. 
of gpus : %d\n\n", count) +} + +func printGPUSpec(gpu *aga.GPU, specOnly bool) { + spec := gpu.GetSpec() + status := gpu.GetStatus() + + fmt.Printf("%-40s : %s (%d)\n", "Id", utils.IdToStr(spec.GetId()), + status.GetIndex()) + if spec.GetAdminState() != aga.GPUAdminState_GPU_ADMIN_STATE_NONE { + fmt.Printf("%-40s : %s\n", "Admin state", + strings.ToLower(strings.Replace(spec.GetAdminState().String(), + "GPU_ADMIN_STATE_", "", -1))) + } + if spec.GetOverDriveLevel() != UINT32_MAX_VAL_UINT32 { + fmt.Printf("%-40s : %v\n", "Clock overdrive level", + spec.GetOverDriveLevel()) + } + if spec.GetGPUPowerCap() != 0 { + fmt.Printf("%-40s : %d\n", "Power overdrive (in watts)", + spec.GetGPUPowerCap()) + } + if spec.GetPerformanceLevel() != + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_NONE { + fmt.Printf("%-40s : %s\n", "Performance level", + strings.ToLower(strings.Replace(spec.GetPerformanceLevel().String(), + "GPU_PERF_LEVEL_", "", -1))) + } + for _, clockFreq := range spec.GetClockFrequency() { + if clockFreq.GetLowFrequency() != UINT32_MAX_VAL_UINT32 && + clockFreq.GetHighFrequency() != UINT32_MAX_VAL_UINT32 { + fmt.Printf("%-40s : %s\n", "GPU clock type", + strings.Replace(clockFreq.GetClockType().String(), + "GPU_CLOCK_TYPE_", "", -1)) + fmt.Printf(" %-38s : %d - %d\n", + "Frequency range (in MHz)", + clockFreq.GetLowFrequency(), + clockFreq.GetHighFrequency()) + } + } + if spec.GetFanSpeed() != UINT64_MAX_VAL { + fmt.Printf("%-40s : %v\n", "Fan speed", spec.GetFanSpeed()) + } + if spec.GetComputePartitionType() != + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_NONE { + fmt.Printf("%-40s : %s\n", "Compute partition type", + strings.Replace(spec.GetComputePartitionType().String(), + "GPU_COMPUTE_PARTITION_TYPE_", "", -1)) + } + if spec.GetMemoryPartitionType() != + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NONE { + fmt.Printf("%-40s : %s\n", "Memory partition type", + strings.Replace(spec.GetMemoryPartitionType().String(), + "GPU_MEMORY_PARTITION_TYPE_", "", -1)) + } + // TODO: fill GPU RAS Spec + if specOnly { + fmt.Printf("\n%s\n", strings.Repeat("-", 80)) + } +} + +func printPCIeStatusHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "PCIe status : \n") + printHdr = true + } +} + +func printVRAMStatusHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "VRAM status:\n") + printHdr = true + } +} + +func printGPUStatus(gpu *aga.GPU, statusOnly bool) { + var indent string + spec := gpu.GetSpec() + status := gpu.GetStatus() + + if statusOnly { + fmt.Printf("\n%-38s : %s (%d)\n", "GPU id", utils.IdToStr(spec.GetId()), + status.GetIndex()) + indent = "" + } else { + fmt.Printf("\nStatus :\n") + indent = " " + } + fmt.Printf(indent+"%-38s : %d\n", "Index", status.GetIndex()) + fmt.Printf(indent+"%-38s : %d\n", "KFD id", status.GetKFDId()) + fmt.Printf(indent+"%-38s : %d\n", "DRM render id", status.GetDRMRenderId()) + fmt.Printf(indent+"%-38s : %d\n", "DRM card id", status.GetDRMCardId()) + fmt.Printf(indent+"%-38s : %s\n", "Virtualization mode", + strings.ToLower(strings.Replace(status.GetVirtualizationMode().String(), + "GPU_VIRTUALIZATION_MODE_", "", -1))) + fmt.Printf(indent+"%-38s : 0x%x\n", "GPU handle", status.GetGPUHandle()) + if status.GetSerialNum() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Serial number", + status.GetSerialNum()) + } + if status.GetCardSeries() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card series", status.GetCardSeries()) + } + if status.GetCardModel() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card model", 
status.GetCardModel()) + } + if status.GetCardVendor() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card vendor", status.GetCardVendor()) + } + if status.GetCardSKU() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card SKU", status.GetCardSKU()) + } + if status.GetDriverVersion() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Driver version", + status.GetDriverVersion()) + } + if status.GetVBIOSPartNumber() != "" { + fmt.Printf(indent+"%-38s : %s\n", "VBIOS part number", + status.GetVBIOSPartNumber()) + } + if status.GetVBIOSVersion() != "" { + fmt.Printf(indent+"%-38s : %s\n", "VBIOS version", + status.GetVBIOSVersion()) + } + switch spec.GetComputePartitionType() { + case aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_NONE: + break + default: + fmt.Printf(indent+"%-38s : %d\n", "Partition Id", + status.GetPartitionId()) + } + fwVer := status.GetFirmwareVersion() + if len(fwVer) != 0 { + fmt.Printf(indent + "Firmware versions:\n") + for i := 0; i < len(fwVer); i++ { + fwVerStr := fmt.Sprintf("%s %s", fwVer[i].GetFirmware(), + "firmware version") + fmt.Printf(indent+" %-36s : %s\n", fwVerStr, fwVer[i].GetVersion()) + } + } + if status.GetMemoryVendor() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Memory vendor", + status.GetMemoryVendor()) + } + if status.GetOperStatus() != aga.GPUOperStatus_GPU_OPER_STATUS_NONE { + fmt.Printf(indent+"%-38s : %s\n", "Operational status", + strings.ToLower(strings.Replace(status.GetOperStatus().String(), + "GPU_OPER_STATUS_", "", -1))) + } + clkStr := "" + idxr := 0 + for _, clkStatus := range status.GetClockStatus() { + curClkStr := strings.Replace(clkStatus.GetType().String(), + "GPU_CLOCK_TYPE_", "", -1) + if clkStr != curClkStr { + clkStr = curClkStr + idxr = 0 + } + if clkStatus.GetType() != aga.GPUClockType_GPU_CLOCK_TYPE_NONE && + clkStatus.GetFrequency() != 0 && + clkStatus.GetFrequency() != UINT16_MAX_VAL_UINT32 && + clkStatus.GetFrequency() != UINT32_MAX_VAL_UINT32 && + clkStatus.GetLowFrequency() != UINT32_MAX_VAL_UINT32 && + clkStatus.GetHighFrequency() != UINT32_MAX_VAL_UINT32 { + fmt.Printf(indent+"%-38s : %s_%d\n", "GPU clock type", clkStr, idxr) + fmt.Printf(indent+" %-36s : %d\n", "Frequency (in MHz)", + clkStatus.GetFrequency()) + fmt.Printf(indent+" %-36s : %d - %d\n", "Frequency range (in MHz)", + clkStatus.GetLowFrequency(), clkStatus.GetHighFrequency()) + if clkStatus.GetLocked() { + fmt.Printf(indent+" %-36s : true\n", "Clock locked") + } + if clkStatus.GetDeepSleep() { + fmt.Printf(indent+" %-36s : true\n", "Deep sleep enabled") + } + } + idxr++ + } + kfdPids := status.GetKFDProcessId() + if len(kfdPids) != 0 { + kfdPidStr := fmt.Sprintf("%-38s : ", "KFD process id using GPU") + for i := 0; i < len(kfdPids); i++ { + fmt.Printf(indent+"%-41s%d\n", kfdPidStr, kfdPids[i]) + kfdPidStr = "" + } + } + // TODO: fill GPU RAS status + xgmiStatus := status.GetXGMIStatus() + if xgmiStatus.GetErrorStatus() != + aga.GPUXGMIErrorStatus_GPU_XGMI_STATUS_NONE { + fmt.Printf(indent+"%-38s : %s\n", "XGMI error status", + strings.ToLower(strings.Replace( + xgmiStatus.GetErrorStatus().String(), "GPU_XGMI_STATUS_", + "", -1))) + } + if xgmiStatus.GetWidth() != 0 && + xgmiStatus.GetWidth() != UINT16_MAX_VAL_UINT64 && + xgmiStatus.GetWidth() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %v\n", "XGMI link width (in GB/s)", + xgmiStatus.GetWidth()) + } + if xgmiStatus.GetSpeed() != 0 && + xgmiStatus.GetSpeed() != UINT16_MAX_VAL_UINT64 && + xgmiStatus.GetSpeed() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %v\n", "XGMI link speed (in GB/s)", + 
xgmiStatus.GetSpeed()) + } + if status.GetThrottlingStatus() != + aga.GPUThrottlingStatus_GPU_THROTTLING_STATUS_NONE { + fmt.Printf(indent+"%-38s : %s\n", "GPU throttling", + strings.ToLower(strings.Replace( + status.GetThrottlingStatus().String(), "GPU_THROTTLING_STATUS_", + "", -1))) + } + if (status.GetFWTimestamp() != 0) && + (status.GetFWTimestamp() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %v\n", "FW timestamp (in ns)", + status.GetFWTimestamp()) + } + /* commenting voltage-curve-point display for time being until it is added + back to status proto + vcp := status.GetVoltageCurvePoint() + if len(vcp) != 0 { + valid_vc := false + for i := 0; i < len(vcp); i++ { + if vcp[i].GetFrequency() != 0 || vcp[i].GetVoltage() != 0 { + valid_vc = true + } + } + if valid_vc { + fmt.Printf(indent+"Voltage curve points:\n") + for i := 0; i < len(vcp); i++ { + if vcp[i].GetFrequency() != 0 || vcp[i].GetVoltage() != 0 { + fmt.Printf(indent+" %-36s : %d\n", "Curve point", + vcp[i].GetPoint()) + fmt.Printf(indent+" %-34s : %d\n", + "Frequency (in MHz)", + vcp[i].GetFrequency()) + fmt.Printf(indent+" %-34s : %d\n", "Voltage (in mV)", + vcp[i].GetVoltage()) + } + } + } + } + */ + if status.GetPCIeStatus() != nil { + printHdr = false + pcie := status.GetPCIeStatus() + if pcie.GetVersion() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Version", pcie.GetVersion()) + } + if pcie.GetSlotType() != aga.PCIeSlotType_PCIE_SLOT_TYPE_NONE { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "Card form factor", + strings.ToLower(strings.Replace(pcie.GetSlotType().String(), + "PCIE_SLOT_TYPE_", "", -1))) + } + if pcie.GetPCIeBusId() != "" { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "Bus id", pcie.GetPCIeBusId()) + } + if pcie.GetWidth() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current number of lanes", + pcie.GetWidth()) + } + if pcie.GetMaxWidth() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Maximum number of lanes", + pcie.GetMaxWidth()) + } + if pcie.GetSpeed() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current speed (in GT/s)", + pcie.GetSpeed()) + } + if pcie.GetMaxSpeed() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Maximum speed (in GT/s)", + pcie.GetMaxSpeed()) + } + if pcie.GetBandwidth() != 0 && + pcie.GetBandwidth() != UINT32_MAX_VAL_UINT64 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current bandwidth (in MB/s)", + pcie.GetBandwidth()) + } + } + if status.GetVRAMStatus() != nil { + printHdr = false + vram := status.GetVRAMStatus() + if vram.GetType() != aga.VRAMType_VRAM_TYPE_NONE { + printVRAMStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "VRAM type", + strings.ToLower(strings.Replace(vram.GetType().String(), + "VRAM_TYPE_", "", -1))) + } + if vram.GetVendor() != "" { + printVRAMStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "VRAM vendor", + strings.ToLower(vram.GetVendor())) + } else { + fmt.Printf(indent+" %-36s : %s\n", "VRAM vendor", "-") + } + if vram.GetSize_() != 0 { + printVRAMStatusHdr(indent) + fmt.Printf(indent+" %-36s : %v\n", "VRAM size (in MB)", + vram.GetSize_()) + } + } + if statusOnly { + fmt.Printf("\n%s\n", strings.Repeat("-", 80)) + } +} + +func printVRAMUsageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "VRAM usage:\n") + printHdr = true + } +} + +func printVoltageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + 
"GPU voltage statistics:\n") + printHdr = true + } +} + +func printTemperatureHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "Temperature information:\n") + printHdr = true + } +} + +func printPCIeHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "PCIe statistics:\n") + printHdr = true + } +} + +func printUsageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "Current GPU usage:\n") + printHdr = true + } +} + +func printGPUStats(gpu *aga.GPU, statsOnly bool) { + var indent string + spec := gpu.GetSpec() + stats := gpu.GetStats() + status := gpu.GetStatus() + + if statsOnly { + fmt.Printf("\n%-38s : %s (%d)\n", "GPU id", utils.IdToStr(spec.GetId()), + status.GetIndex()) + indent = "" + } else { + fmt.Printf("\nStatistics :\n") + indent = " " + } + if stats.GetPackagePower() != 0 && + stats.GetPackagePower() != UINT16_MAX_VAL_UINT64 { + fmt.Printf(indent+"%-38s : %d\n", "Current graphics power (in Watts)", + stats.GetPackagePower()) + } + if stats.GetAvgPackagePower() != 0 && + stats.GetAvgPackagePower() != UINT16_MAX_VAL_UINT64 { + fmt.Printf(indent+"%-38s : %d\n", "Average graphics power (in Watts)", + stats.GetAvgPackagePower()) + } + if stats.GetTemperature() != nil { + printHdr = false + if stats.GetTemperature().GetEdgeTemperature() != 0 && + stats.GetTemperature().GetEdgeTemperature() != FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + fmt.Printf(indent+" %-36s : %.1f\n", "Edge temperature (in C)", + stats.GetTemperature().GetEdgeTemperature()) + } + if stats.GetTemperature().GetJunctionTemperature() != 0 && + stats.GetTemperature().GetJunctionTemperature() != + FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + fmt.Printf(indent+" %-36s : %.1f\n", "Junction temperature (in C)", + stats.GetTemperature().GetJunctionTemperature()) + } + if stats.GetTemperature().GetMemoryTemperature() != 0 && + stats.GetTemperature().GetMemoryTemperature() != + FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + fmt.Printf(indent+" %-36s : %.1f\n", "VRAM temperature (in C)", + stats.GetTemperature().GetMemoryTemperature()) + } + hbmTemp := stats.GetTemperature().GetHBMTemperature() + for index, temp := range hbmTemp { + if temp != 0 && temp != FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + hbmStr := "HBM " + strconv.Itoa(index) + " temperature (in C)" + fmt.Printf(indent+" %-36s : %.1f\n", hbmStr, temp) + } + } + } + if stats.GetUsage() != nil { + printHdr = false + if stats.GetUsage().GetGFXActivity() != 0 && + stats.GetUsage().GetGFXActivity() != UINT32_MAX_VAL_UINT32 && + stats.GetUsage().GetGFXActivity() <= 100 { + printUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "GFX activity", + stats.GetUsage().GetGFXActivity()) + } + if stats.GetUsage().GetUMCActivity() != 0 && + stats.GetUsage().GetUMCActivity() != UINT16_MAX_VAL_UINT32 && + stats.GetUsage().GetUMCActivity() <= 100 { + printUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "UMC activity", + stats.GetUsage().GetUMCActivity()) + } + if stats.GetUsage().GetMMActivity() != 0 && + stats.GetUsage().GetMMActivity() != UINT16_MAX_VAL_UINT32 && + stats.GetUsage().GetMMActivity() <= 100 { + printUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "MM activity", + stats.GetUsage().GetMMActivity()) + } + vStr := fmt.Sprintf(" %-36s : ", "VCN activity") + // used to decide if vcn activity should be printed or not + validEntry := false + for _, vcn := range stats.GetUsage().GetVCNActivity() { + // only if at least one of the vcn activities is a valid value do we + // 
print the field + if vcn == UINT16_MAX_VAL_UINT32 || vcn > 100 { + vStr = fmt.Sprintf("%sN/A ", vStr) + } else { + validEntry = true + vStr = fmt.Sprintf("%s%d%% ", vStr, vcn) + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", vStr) + validEntry = false + } + jStr := fmt.Sprintf(" %-36s : ", "JPEG activity") + for i, jpeg := range stats.GetUsage().GetJPEGActivity() { + // only if at least one of the jpeg activities is a valid value do + // we print the field + if jpeg == UINT16_MAX_VAL_UINT32 || jpeg > 100 { + jStr = fmt.Sprintf("%sN/A ", jStr) + } else { + validEntry = true + jStr = fmt.Sprintf("%s%d%% ", jStr, jpeg) + } + if (i+1)%8 == 0 { + jStr = fmt.Sprintf("%s\n%s%-41s", jStr, indent, "") + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", jStr) + validEntry = false + } + gStr := fmt.Sprintf(" %-36s : ", "GFX utilization") + for _, gfx := range stats.GetUsage().GetGFXBusyInst() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT16_MAX_VAL_UINT32 || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + vStr = fmt.Sprintf(" %-36s : ", "VCN utilization") + for _, vcn := range stats.GetUsage().GetVCNBusyInst() { + // only if at least one of the vcn busy value is a valid value do we + // print the field + if vcn == UINT16_MAX_VAL_UINT32 || vcn > 100 { + vStr = fmt.Sprintf("%sN/A ", vStr) + } else { + validEntry = true + vStr = fmt.Sprintf("%s%d%% ", vStr, vcn) + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", vStr) + validEntry = false + } + jStr = fmt.Sprintf(" %-36s : ", "JPEG utilization") + for i, jpeg := range stats.GetUsage().GetJPEGBusyInst() { + // only if at least one of the jpeg busy value is a valid value do + // we print the field + if jpeg == UINT16_MAX_VAL_UINT32 || jpeg > 100 { + jStr = fmt.Sprintf("%sN/A ", jStr) + } else { + validEntry = true + jStr = fmt.Sprintf("%s%d%% ", jStr, jpeg) + } + if (i+1)%8 == 0 { + jStr = fmt.Sprintf("%s\n%s%-41s", jStr, indent, "") + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", jStr) + validEntry = false + } + } + if stats.GetVoltage() != nil { + v := stats.GetVoltage() + printHdr = false + if v.GetVoltage() != 0 && v.GetVoltage() != UINT16_MAX_VAL_UINT64 { + printVoltageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current voltage (in mV)", + v.GetVoltage()) + } + if v.GetGFXVoltage() != 0 && + v.GetGFXVoltage() != UINT16_MAX_VAL_UINT64 { + printVoltageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", + "Current graphics voltage(in mV)", v.GetGFXVoltage()) + } + if v.GetMemoryVoltage() != 0 && + v.GetMemoryVoltage() != UINT16_MAX_VAL_UINT64 { + printVoltageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current memory voltage(in mV)", + v.GetMemoryVoltage()) + } + } + if stats.GetPCIeStats() != nil { + printHdr = false + p := stats.GetPCIeStats() + if p.GetReplayCount() != 0 && p.GetReplayCount() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Replay count", + p.GetReplayCount()) + } + if p.GetRecoveryCount() != 0 && + p.GetRecoveryCount() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Recovery count", + p.GetRecoveryCount()) + } + if p.GetReplayRolloverCount() != 0 && + p.GetReplayRolloverCount() != UINT64_MAX_VAL { + 
printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Replay rollover count", + p.GetReplayRolloverCount()) + } + if p.GetNACKSentCount() != 0 && + p.GetNACKSentCount() != UINT64_MAX_VAL && + p.GetNACKSentCount() != UINT32_MAX_VAL_UINT64 { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "NACKs sent", + p.GetNACKSentCount()) + } + if p.GetNACKReceivedCount() != 0 && + p.GetNACKReceivedCount() != UINT64_MAX_VAL && + p.GetNACKReceivedCount() != UINT32_MAX_VAL_UINT64 { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "NACKs received", + p.GetNACKReceivedCount()) + } + if p.GetRxBytes() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total received bytes", + p.GetRxBytes()) + } + if p.GetTxBytes() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total transmitted bytes", + p.GetTxBytes()) + } + if p.GetBiDirBandwidth() != UINT16_MAX_VAL_UINT64 { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", + "Bidirectional bandwidth (in GB/s)", + p.GetBiDirBandwidth()) + } + } + if stats.GetVRAMUsage() != nil { + printHdr = false + vram := stats.GetVRAMUsage() + if vram.GetTotalVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total VRAM (in MB)", + vram.GetTotalVRAM()) + } + if vram.GetUsedVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Used VRAM (in MB)", + vram.GetUsedVRAM()) + } + if vram.GetFreeVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Free VRAM (in MB)", + vram.GetFreeVRAM()) + } + if vram.GetTotalVisibleVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total visible VRAM (in MB)", + vram.GetTotalVisibleVRAM()) + } + if vram.GetUsedVisibleVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Used visible VRAM (in MB)", + vram.GetUsedVisibleVRAM()) + } + if vram.GetFreeVisibleVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Free visible VRAM (in MB)", + vram.GetFreeVisibleVRAM()) + } + if vram.GetTotalGTT() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total GTT (in MB)", + vram.GetTotalGTT()) + } + if vram.GetUsedGTT() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Used GTT (in MB)", + vram.GetUsedGTT()) + } + if vram.GetFreeGTT() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Free GTT (in MB)", + vram.GetFreeGTT()) + } + } + if stats.GetEnergyConsumed() != 0 { + fmt.Printf(indent+"%-38s : %.2f\n", + "Accumulated energy consumed (in uJ)", + stats.GetEnergyConsumed()) + } + if stats.GetTotalCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Total correctable errors", + stats.GetTotalCorrectableErrors()) + } + if stats.GetTotalUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Total uncorrectable errors", + stats.GetTotalUncorrectableErrors()) + } + if stats.GetSDMACorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SDMA correctable errors", + stats.GetSDMACorrectableErrors()) + } + if stats.GetSDMAUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SDMA uncorrectable errors", + stats.GetSDMAUncorrectableErrors()) + } + if stats.GetGFXCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "GFX correctable errors", + stats.GetGFXCorrectableErrors()) + } + if stats.GetGFXUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "GFX uncorrectable errors", + 
stats.GetGFXUncorrectableErrors()) + } + if stats.GetMMHUBCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MMHUB correctable errors", + stats.GetMMHUBCorrectableErrors()) + } + if stats.GetMMHUBUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MMHUB uncorrectable errors", + stats.GetMMHUBUncorrectableErrors()) + } + if stats.GetATHUBCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "ATHUB correctable errors", + stats.GetATHUBCorrectableErrors()) + } + if stats.GetATHUBUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "ATHUB uncorrectable errors", + stats.GetATHUBUncorrectableErrors()) + } + if stats.GetBIFCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "BIF correctable errors", + stats.GetBIFCorrectableErrors()) + } + if stats.GetBIFUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "BIF uncorrectable errors", + stats.GetBIFUncorrectableErrors()) + } + if stats.GetHDPCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "HDP correctable errors", + stats.GetHDPCorrectableErrors()) + } + if stats.GetHDPUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "HDP uncorrectable errors", + stats.GetHDPUncorrectableErrors()) + } + if stats.GetXGMIWAFLCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "XGMI WAFL correctable errors", + stats.GetXGMIWAFLCorrectableErrors()) + } + if stats.GetXGMIWAFLUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "XGMI WAFL uncorrectable errors", + stats.GetXGMIWAFLUncorrectableErrors()) + } + if stats.GetDFCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "DF correctable errors", + stats.GetDFCorrectableErrors()) + } + if stats.GetDFUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "DF uncorrectable errors", + stats.GetDFUncorrectableErrors()) + } + if stats.GetSMNCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SMN correctable errors", + stats.GetSMNCorrectableErrors()) + } + if stats.GetSMNUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SMN uncorrectable errors", + stats.GetSMNUncorrectableErrors()) + } + if stats.GetSEMCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SEM correctable errors", + stats.GetSEMCorrectableErrors()) + } + if stats.GetSEMUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SEM uncorrectable errors", + stats.GetSEMUncorrectableErrors()) + } + if stats.GetMP0CorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP0 correctable errors", + stats.GetMP0CorrectableErrors()) + } + if stats.GetMP0UncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP0 uncorrectable errors", + stats.GetMP0UncorrectableErrors()) + } + if stats.GetMP1CorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP1 correctable errors", + stats.GetMP1CorrectableErrors()) + } + if stats.GetMP1UncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP1 uncorrectable errors", + stats.GetMP1UncorrectableErrors()) + } + if stats.GetFUSECorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "FUSE correctable errors", + stats.GetFUSECorrectableErrors()) + } + if stats.GetFUSEUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "FUSE uncorrectable errors", + stats.GetFUSEUncorrectableErrors()) + } + if stats.GetUMCCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "UMC correctable errors", + stats.GetUMCCorrectableErrors()) + } + if stats.GetUMCUncorrectableErrors() != 0 { + 
fmt.Printf(indent+"%-38s : %d\n", "UMC uncorrectable errors", + stats.GetUMCUncorrectableErrors()) + } + if stats.GetMCACorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MCA correctable errors", + stats.GetMCACorrectableErrors()) + } + if stats.GetMCAUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MCA uncorrectable errors", + stats.GetMCAUncorrectableErrors()) + } + if stats.GetVCNCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "VCN correctable errors", + stats.GetVCNCorrectableErrors()) + } + if stats.GetVCNUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "VCN uncorrectable errors", + stats.GetVCNUncorrectableErrors()) + } + if stats.GetJPEGCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "JPEG correctable errors", + stats.GetJPEGCorrectableErrors()) + } + if stats.GetJPEGUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "JPEG uncorrectable errors", + stats.GetJPEGUncorrectableErrors()) + } + if stats.GetIHCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "IH correctable errors", + stats.GetIHCorrectableErrors()) + } + if stats.GetIHUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "IH uncorrectable errors", + stats.GetIHUncorrectableErrors()) + } + if stats.GetMPIOCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MPIO correctable errors", + stats.GetMPIOCorrectableErrors()) + } + if stats.GetMPIOUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MPIO uncorrectable errors", + stats.GetMPIOUncorrectableErrors()) + } + if stats.GetXGMINeighbor0TxNOPs() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Nops sent to XGMI neighbor0", + stats.GetXGMINeighbor0TxNOPs()) + } + if stats.GetXGMINeighbor0TxRequests() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Outgoing requests to XGMI neighbor0", + stats.GetXGMINeighbor0TxRequests()) + } + if stats.GetXGMINeighbor0TxResponses() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Outgoing responses to XGMI neighbor0", + stats.GetXGMINeighbor0TxRequests()) + } + if stats.GetXGMINeighbor0TXBeats() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Data beats sent to neighbor0", + stats.GetXGMINeighbor0TXBeats()) + } + if stats.GetXGMINeighbor1TxNOPs() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Nops sent to XGMI neighbor1", + stats.GetXGMINeighbor1TxNOPs()) + } + if stats.GetXGMINeighbor1TxRequests() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Outgoing requests to XGMI neighbor1", + stats.GetXGMINeighbor1TxRequests()) + } + if stats.GetXGMINeighbor1TxResponses() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Outgoing responses to XGMI neighbor1", + stats.GetXGMINeighbor1TxRequests()) + } + if stats.GetXGMINeighbor1TXBeats() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Data beats sent to neighbor1", + stats.GetXGMINeighbor1TXBeats()) + } + if stats.GetXGMINeighbor0TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor0 (in BPS)", + stats.GetXGMINeighbor0TxThroughput()) + } + if stats.GetXGMINeighbor1TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor1 (in BPS)", + stats.GetXGMINeighbor1TxThroughput()) + } + if stats.GetXGMINeighbor2TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor2 (in BPS)", + stats.GetXGMINeighbor2TxThroughput()) + } + if stats.GetXGMINeighbor3TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor3 (in BPS)", + stats.GetXGMINeighbor3TxThroughput()) + } + 
if stats.GetXGMINeighbor4TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor4 (in BPS)", + stats.GetXGMINeighbor4TxThroughput()) + } + if stats.GetXGMINeighbor5TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor5 (in BPS)", + stats.GetXGMINeighbor5TxThroughput()) + } + if stats.GetPowerUsage() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Power usage (in Watts)", stats.GetPowerUsage()) + } + if (stats.GetFanSpeed() != 0) && + (stats.GetFanSpeed() != UINT16_MAX_VAL_UINT64) { + fmt.Printf(indent+"%-38s : %d\n", + "Fan speed (in RPMs)", stats.GetFanSpeed()) + } + if (stats.GetGFXActivityAccumulated() != 0) && + (stats.GetGFXActivityAccumulated() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", + "GFX activity accumulated", + stats.GetGFXActivityAccumulated()) + } + if (stats.GetMemoryActivityAccumulated() != 0) && + (stats.GetMemoryActivityAccumulated() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", + "Memory activity accumulated", + stats.GetMemoryActivityAccumulated()) + } + for i, linkStats := range stats.GetXGMILinkStats() { + link := "Link " + fmt.Sprintf("%v", i+1) + if (linkStats.GetDataRead() != 0) && + (linkStats.GetDataRead() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", link+" data read (in KB)", + linkStats.GetDataRead()) + } + if (linkStats.GetDataWrite() != 0) && + (linkStats.GetDataWrite() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", link+" data written (in KB)", + linkStats.GetDataWrite()) + } + } + if stats.GetViolationStats() != nil { + vStats := stats.GetViolationStats() + if vStats.GetCurrentAccumulatedCounter() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", "Current accumulated counter", + vStats.GetCurrentAccumulatedCounter()) + } + if vStats.GetProcessorHotResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "Processor hot residency accumulated", + vStats.GetProcessorHotResidencyAccumulated()) + } + if vStats.GetPPTResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", "PPT residency accumulated", + vStats.GetPPTResidencyAccumulated()) + } + if vStats.GetSocketThermalResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "Socket thermal residency accumulated", + vStats.GetSocketThermalResidencyAccumulated()) + } + if vStats.GetVRThermalResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "VR thermal residency accumulated", + vStats.GetVRThermalResidencyAccumulated()) + } + if vStats.GetHBMThermalResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "HBM thermal residency accumulated", + vStats.GetHBMThermalResidencyAccumulated()) + } + } + + fmt.Printf("\n%s\n", strings.Repeat("-", 80)) +} + +type ShadowGPU struct { + Id string + *aga.GPUSpec + *aga.GPUStatus + *aga.GPUStats +} + +func NewGPU(resp *aga.GPU) *ShadowGPU { + return &ShadowGPU{ + Id: utils.IdToStr(resp.GetSpec().GetId()), + GPUSpec: resp.GetSpec(), + GPUStatus: resp.GetStatus(), + GPUStats: resp.GetStats(), + } +} + +func printGPUJson(resp *aga.GPU) { + gpu := NewGPU(resp) + b, _ := json.MarshalIndent(gpu, " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuUpdateCmdPreRunE(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if cmd.Flags().NFlag() == 1 { + return fmt.Errorf("Nothing to update") + } + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + if 
cmd.Flags().Changed("admin-state") { + switch strings.ToLower(gpuAdminState) { + case "up": + gpuAdminStateVal = aga.GPUAdminState_GPU_ADMIN_STATE_UP + case "down": + gpuAdminStateVal = aga.GPUAdminState_GPU_ADMIN_STATE_DOWN + default: + return fmt.Errorf("Invalid argument for \"admin-state\", please " + + "refer help") + } + } + if cmd.Flags().Changed("compute-partition") { + switch strings.ToLower(computePartition) { + case "spx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_SPX + case "dpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_DPX + case "tpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_TPX + case "qpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_QPX + case "cpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_CPX + default: + return fmt.Errorf("Invalid argument for \"compute-partition\", " + + "please refer help") + } + } + if cmd.Flags().Changed("memory-partition") { + switch strings.ToLower(memPartition) { + case "nps1": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS1 + case "nps2": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS2 + case "nps4": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS4 + case "nps8": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS8 + default: + return fmt.Errorf("Invalid argument for \"memory-partition\", " + + "please refer help") + } + } + if cmd.Flags().Changed("perf-level") { + switch strings.ToLower(perfLevel) { + case "none": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_NONE + case "auto": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_AUTO + case "low": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_LOW + case "high": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_HIGH + case "deterministic": + PerformanceLevelVal = + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_DETERMINISTIC + case "memclock": + PerformanceLevelVal = + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_STABLE_MIN_MCLK + case "sysclock": + PerformanceLevelVal = + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_STABLE_MIN_SCLK + case "manual": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_MANUAL + default: + return fmt.Errorf("Invalid argument for \"perf-level\", please " + + "refer help") + } + } + if cmd.Flags().Changed("clock-frequency") != + cmd.Flags().Changed("clock-type") { + return fmt.Errorf("Both \"clock-type\" and \"clock-frequency\" need " + + "to be specified") + } + if cmd.Flags().Changed("clock-type") { + switch strings.ToLower(gpuClkType) { + case "memory": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_MEMORY + case "system": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_SYSTEM + case "video": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_VIDEO + case "data": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_DATA + default: + return fmt.Errorf("Invalid \"clock-type\" specified, please " + + "refer help") + } + } + if cmd.Flags().Changed("clock-frequency") { + _, err := fmt.Sscanf(gpuClkFreq, "%d-%d", &gpuClkFreqLo, &gpuClkFreqHi) + if err != nil { + return fmt.Errorf("Invalid range for \"clock-frequency\", please " + + "refer help") + } + } + return nil +} + +func gpuUpdateCmdHandler(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if len(args) > 0 { + 
return fmt.Errorf("Invalid argument") + } + cmd.SilenceUsage = true + + // get GPU spec + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Get GPU failed with %v error", respMsg.ApiStatus) + } + resp := respMsg.GetResponse()[0] + gpuSpec := resp.GetSpec() + if gpuSpec == nil { + return fmt.Errorf("GPU object not found") + } + // update the configured fields + updateSpec := *gpuSpec + if cmd.Flags().Changed("admin-state") { + updateSpec.AdminState = gpuAdminStateVal + } + if cmd.Flags().Changed("overdrive-level") { + updateSpec.OverDriveLevel = overDriveLevel + } + if cmd.Flags().Changed("power-cap") { + updateSpec.GPUPowerCap = powerCap + } + if cmd.Flags().Changed("perf-level") { + updateSpec.PerformanceLevel = PerformanceLevelVal + } + if cmd.Flags().Changed("clock-frequency") { + for i, freq := range updateSpec.GetClockFrequency() { + if freq.GetClockType() == clockType { + updateSpec.ClockFrequency[i] = &aga.GPUClockFrequencyRange{ + ClockType: clockType, + LowFrequency: gpuClkFreqLo, + HighFrequency: gpuClkFreqHi, + } + } + } + } + if cmd.Flags().Changed("fan-speed") { + updateSpec.FanSpeed = fanSpeed + } + if cmd.Flags().Changed("compute-partition") { + updateSpec.ComputePartitionType = computePartitionVal + } + if cmd.Flags().Changed("memory-partition") { + updateSpec.MemoryPartitionType = memPartitionVal + } + reqMsg := &aga.GPUUpdateRequest{ + Spec: []*aga.GPUSpec{ + &updateSpec, + }, + } + // GPU agent call + updateRespMsg, err := client.GPUUpdate(ctxt, reqMsg) + if err != nil { + return fmt.Errorf("Updating GPU failed, err %v", err) + } + if updateRespMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with error %v, error code %v", + updateRespMsg.ApiStatus, updateRespMsg.ErrorCode) + } + fmt.Printf("Updating GPU succeeded\n") + return nil +} + +func gpuResetCmdPreRunE(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + numFlags := 0 + if cmd.Flags().Changed("clocks") { + numFlags += 1 + } + if cmd.Flags().Changed("fans") { + numFlags += 1 + } + if cmd.Flags().Changed("power-profile") { + numFlags += 1 + } + if cmd.Flags().Changed("power-overdrive") { + numFlags += 1 + } + if cmd.Flags().Changed("xgmi-error") { + numFlags += 1 + } + if cmd.Flags().Changed("perf-determinism") { + numFlags += 1 + } + if cmd.Flags().Changed("compute-partition") { + numFlags += 1 + } + if cmd.Flags().Changed("nps-mode") { + numFlags += 1 + } + if numFlags == 0 { + // more than 1 reset option is specified, reject + return fmt.Errorf("Invalid arguments, one of \"clocks\", \"fans\", " + + "\"power-profile\", \"power-overdrive\", \"xgmi-error\", " + + "\"perf-determinism\", \"compute-partition\", \"nps-mode\" must " + + "be specified") + } + // all above options are mutually exclusive + if numFlags > 1 { + // more than 1 reset option is specified, 
reject + return fmt.Errorf("Invalid arguments, \"clocks\", \"fans\", " + + "\"power-profile\", \"power-overdrive\", \"xgmi-error\", " + + "\"perf-determinism\", \"compute-partition\", \"nps-mode\" are " + + "mutually exlcusive, specify only one") + } + return nil +} + +func gpuResetCmdHandler(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + cmd.SilenceUsage = true + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + req := &aga.GPUResetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + if cmd.Flags().Changed("clocks") { + req.Reset_ = &aga.GPUResetRequest_ResetClocks{ + ResetClocks: true, + } + } else if cmd.Flags().Changed("fans") { + req.Reset_ = &aga.GPUResetRequest_ResetFans{ + ResetFans: true, + } + } else if cmd.Flags().Changed("power-profile") { + req.Reset_ = &aga.GPUResetRequest_ResetPowerProfile{ + ResetPowerProfile: true, + } + } else if cmd.Flags().Changed("power-overdrive") { + req.Reset_ = &aga.GPUResetRequest_ResetPowerOverDrive{ + ResetPowerOverDrive: true, + } + } else if cmd.Flags().Changed("xgmi-error") { + req.Reset_ = &aga.GPUResetRequest_ResetXGMIError{ + ResetXGMIError: true, + } + } else if cmd.Flags().Changed("perf-determinism") { + req.Reset_ = &aga.GPUResetRequest_ResetPerfDeterminism{ + ResetPerfDeterminism: true, + } + } else if cmd.Flags().Changed("compute-partition") { + req.Reset_ = &aga.GPUResetRequest_ResetComputePartition{ + ResetComputePartition: true, + } + } else if cmd.Flags().Changed("nps-mode") { + req.Reset_ = &aga.GPUResetRequest_ResetNPSMode{ + ResetNPSMode: true, + } + } + client := aga.NewGPUSvcClient(c) + respMsg, err := client.GPUReset(ctxt, req) + if err != nil { + return fmt.Errorf("Resetting GPU failed, err %v", err) + } + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with error %v, error code %v", + respMsg.ApiStatus, respMsg.ErrorCode) + } + fmt.Printf("Resetting GPU succeeded\n") + return nil +} diff --git a/sw/nic/gpuagent/protos/gpu.proto b/sw/nic/gpuagent/protos/gpu.proto index dd8a2e2..a5d9e26 100644 --- a/sw/nic/gpuagent/protos/gpu.proto +++ b/sw/nic/gpuagent/protos/gpu.proto @@ -542,19 +542,46 @@ message GPUXGMILinkStats { // GPU violation stats message GPUViolationStats { // current acummulated counter - uint64 CurrentAccumulatedCounter = 1; + uint64 CurrentAccumulatedCounter = 1; // processor hot residency accumulated - uint64 ProcessorHotResidencyAccumulated = 2; + uint64 ProcessorHotResidencyAccumulated = 2; // Package Power Tracking (PPT) residency accumulated - uint64 PPTResidencyAccumulated = 3; + uint64 PPTResidencyAccumulated = 3; // socket thermal residency accumulated - uint64 SocketThermalResidencyAccumulated = 4; + uint64 SocketThermalResidencyAccumulated = 4; // Voltage Rail (VR) thermal residency accumulated - uint64 VRThermalResidencyAccumulated = 5; + uint64 VRThermalResidencyAccumulated = 5; // High Bandwidth Memory (HBM) thermal residency accumulated - uint64 HBMThermalResidencyAccumulated = 6; + uint64 HBMThermalResidencyAccumulated = 6; + // processor hot residency percentage + uint64 ProcessorHotResidencyPercentage = 7; + // Package Power Tracking (PPT) residency percentage + uint64 PPTResidencyPercentage = 8; + // socket thermal residency percentage + 
uint64 SocketThermalResidencyPercentage = 9; + // Voltage Rail (VR) thermal residency percentage + uint64 VRThermalResidencyPercentage = 10; + // High Bandwidth Memory (HBM) thermal residency percentage + uint64 HBMThermalResidencyPercentage = 11; + // gfx clock below host limit power accumulated + repeated uint64 GFXBelowHostLimitPowerAccumulated = 12; + // gfx clock below host limit thermal accumulated + repeated uint64 GFXBelowHostLimitTHMAccumulated = 13; + // gfx low utilization accumulated + repeated uint64 GFXLowUtilizationAccumulated = 14; + // gfx clock below host limit total accumulated + repeated uint64 GFXBelowHostLimitTotalAccumulated = 15; + // gfx clock below host limit power percentage + repeated uint64 GFXBelowHostLimitPowerPercentage = 16; + // gfx clock below host limit thermal percentage + repeated uint64 GFXBelowHostLimitTHMPercentage = 17; + // gfx low utilization percentage + repeated uint64 GFXLowUtilizationPercentage = 18; + // gfx below host limit total percentage + repeated uint64 GFXBelowHostLimitTotalPercentage = 19; } + // GPU statistics message GPUStats { // current graphics package power (in Watts) diff --git a/sw/nic/gpuagent/svc/gpu_to_proto.hpp b/sw/nic/gpuagent/svc/gpu_to_proto.hpp index 479887d..7cad3f1 100644 --- a/sw/nic/gpuagent/svc/gpu_to_proto.hpp +++ b/sw/nic/gpuagent/svc/gpu_to_proto.hpp @@ -610,6 +610,34 @@ aga_gpu_violation_stats_to_proto (amdgpu::GPUViolationStats *proto_stats, stats->vr_thermal_residency_accumulated); proto_stats->set_hbmthermalresidencyaccumulated( stats->hbm_thermal_residency_accumulated); + proto_stats->set_processorhotresidencypercentage( + stats->processor_hot_residency_percentage); + proto_stats->set_pptresidencypercentage( + stats->ppt_residency_percentage); + proto_stats->set_socketthermalresidencypercentage( + stats->socket_thermal_residency_percentage); + proto_stats->set_vrthermalresidencypercentage( + stats->vr_thermal_residency_percentage); + proto_stats->set_hbmthermalresidencypercentage( + stats->hbm_thermal_residency_percentage); + for (uint16_t i = 0; i < AGA_GPU_MAX_XCC; i++) { + proto_stats->add_gfxbelowhostlimitpoweraccumulated( + stats->gfx_clk_below_host_limit_power_accumulated[i]); + proto_stats->add_gfxbelowhostlimitthmaccumulated( + stats->gfx_clk_below_host_limit_thermal_accumulated[i]); + proto_stats->add_gfxlowutilizationaccumulated( + stats->gfx_low_utilization_accumulated[i]); + proto_stats->add_gfxbelowhostlimittotalaccumulated( + stats->gfx_clk_below_host_limit_total_accumulated[i]); + proto_stats->add_gfxbelowhostlimitpowerpercentage( + stats->gfx_clk_below_host_limit_power_percentage[i]); + proto_stats->add_gfxbelowhostlimitthmpercentage( + stats->gfx_clk_below_host_limit_thermal_percentage[i]); + proto_stats->add_gfxlowutilizationpercentage( + stats->gfx_low_utilization_percentage[i]); + proto_stats->add_gfxbelowhostlimittotalpercentage( + stats->gfx_clk_below_host_limit_total_percentage[i]); + } } // populate proto buf stats from gpu stats diff --git a/sw/nic/gpuagent/svc/gpu_to_proto.hpp.orig b/sw/nic/gpuagent/svc/gpu_to_proto.hpp.orig new file mode 100644 index 0000000..479887d --- /dev/null +++ b/sw/nic/gpuagent/svc/gpu_to_proto.hpp.orig @@ -0,0 +1,824 @@ + +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + + +//---------------------------------------------------------------------------- +/// +/// \file +/// This module defines protobuf conversion APIs for gpu object +/// +//---------------------------------------------------------------------------- + +#ifndef __AGA_SVC_GPU_TO_PROTO_HPP__ +#define __AGA_SVC_GPU_TO_PROTO_HPP__ + +#include +#include "nic/gpuagent/svc/gpu.hpp" +#include "nic/gpuagent/api/include/aga_gpu.hpp" + +static inline amdgpu::GPUAdminState +aga_gpu_admin_state_to_proto (aga_gpu_admin_state_t admin_state) +{ + switch(admin_state) { + case AGA_GPU_ADMIN_STATE_UP: + return amdgpu::GPU_ADMIN_STATE_UP; + case AGA_GPU_ADMIN_STATE_DOWN: + return amdgpu::GPU_ADMIN_STATE_DOWN; + case AGA_GPU_ADMIN_STATE_NONE: + default: + break; + } + return amdgpu::GPU_ADMIN_STATE_NONE; +} + +static inline amdgpu::GPUPerformanceLevel +aga_gpu_perf_level_to_proto (aga_gpu_perf_level_t perf_level) +{ + switch (perf_level) { + case AGA_GPU_PERF_LEVEL_AUTO: + return amdgpu::GPU_PERF_LEVEL_AUTO; + case AGA_GPU_PERF_LEVEL_LOW: + return amdgpu::GPU_PERF_LEVEL_LOW; + case AGA_GPU_PERF_LEVEL_HIGH: + return amdgpu::GPU_PERF_LEVEL_HIGH; + case AGA_GPU_PERF_LEVEL_DETERMINISTIC: + return amdgpu::GPU_PERF_LEVEL_DETERMINISTIC; + case AGA_GPU_PERF_LEVEL_STABLE_WITH_MCLK: + return amdgpu::GPU_PERF_LEVEL_STABLE_MIN_MCLK; + case AGA_GPU_PERF_LEVEL_STABLE_WITH_SCLK: + return amdgpu::GPU_PERF_LEVEL_STABLE_MIN_SCLK; + case AGA_GPU_PERF_LEVEL_MANUAL: + return amdgpu::GPU_PERF_LEVEL_MANUAL; + case AGA_GPU_PERF_LEVEL_NONE: + default: + break; + } + return amdgpu::GPU_PERF_LEVEL_NONE; +} + +static inline amdgpu::GPUClockType +aga_gpu_clock_type_to_proto (aga_gpu_clock_type_t type) +{ + switch (type) { + case AGA_GPU_CLOCK_TYPE_FABRIC: + return amdgpu::GPU_CLOCK_TYPE_FABRIC; + case AGA_GPU_CLOCK_TYPE_MEMORY: + return amdgpu::GPU_CLOCK_TYPE_MEMORY; + case AGA_GPU_CLOCK_TYPE_SYSTEM: + return amdgpu::GPU_CLOCK_TYPE_SYSTEM; + case AGA_GPU_CLOCK_TYPE_SOC: + return amdgpu::GPU_CLOCK_TYPE_SOC; + case AGA_GPU_CLOCK_TYPE_DCE: + return amdgpu::GPU_CLOCK_TYPE_DCE; + case AGA_GPU_CLOCK_TYPE_PCIE: + return amdgpu::GPU_CLOCK_TYPE_PCIE; + case AGA_GPU_CLOCK_TYPE_VIDEO: + return amdgpu::GPU_CLOCK_TYPE_VIDEO; + case AGA_GPU_CLOCK_TYPE_DATA: + return amdgpu::GPU_CLOCK_TYPE_DATA; + case AGA_GPU_CLOCK_TYPE_NONE: + default: + break; + } + return amdgpu::GPU_CLOCK_TYPE_NONE; +} + +static inline amdgpu::GPUThrottlingStatus +aga_gpu_throttling_status_to_proto (aga_gpu_throttling_status_t status) +{ + switch (status) { + case AGA_GPU_THROTTLING_STATUS_OFF: + return amdgpu::GPU_THROTTLING_STATUS_OFF; + case AGA_GPU_THROTTLING_STATUS_ON: + return amdgpu::GPU_THROTTLING_STATUS_ON; + case AGA_GPU_THROTTLING_STATUS_NONE: + default: + break; + } + return amdgpu::GPU_THROTTLING_STATUS_NONE; +} + +static inline amdgpu::GPUVirtualizationMode +aga_gpu_virtualization_mode_to_proto (aga_gpu_virtualization_mode_t mode) +{ + switch (mode) { + case AGA_VIRTUALIZATION_MODE_BAREMETAL: + return amdgpu::GPU_VIRTUALIZATION_MODE_BAREMETAL; + case AGA_VIRTUALIZATION_MODE_HOST: + return amdgpu::GPU_VIRTUALIZATION_MODE_HOST; + case AGA_VIRTUALIZATION_MODE_GUEST: + return 
amdgpu::GPU_VIRTUALIZATION_MODE_GUEST; + case AGA_VIRTUALIZATION_MODE_PASSTHROUGH: + return amdgpu::GPU_VIRTUALIZATION_MODE_PASSTHROUGH; + default: + break; + } + return amdgpu::GPU_VIRTUALIZATION_MODE_NONE; +} + +static inline void +aga_gpu_clock_spec_to_proto (GPUClockFrequencyRange *proto_spec, + const aga_gpu_clock_freq_range_t *spec) +{ + proto_spec->set_clocktype(aga_gpu_clock_type_to_proto(spec->clock_type)); + proto_spec->set_lowfrequency(spec->lo); + proto_spec->set_highfrequency(spec->hi); +} + +static inline amdgpu::GPUComputePartitionType +aga_gpu_compute_partition_type_to_proto (aga_gpu_compute_partition_type_t type) +{ + switch (type) { + case AGA_GPU_COMPUTE_PARTITION_TYPE_SPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_SPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_DPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_DPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_TPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_TPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_QPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_QPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_CPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_CPX; + default: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_NONE; + } +} + +static inline amdgpu::GPUMemoryPartitionType +aga_gpu_memory_partition_type_to_proto (aga_gpu_memory_partition_type_t type) +{ + switch (type) { + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS1: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS1; + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS2: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS2; + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS4: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS4; + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS8: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS8; + default: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NONE; + } +} + +// populate proto buf spec from gpu API spec +static inline void +aga_gpu_api_spec_to_proto (GPUSpec *proto_spec, + const aga_gpu_spec_t *spec) +{ + proto_spec->set_id(spec->key.id, OBJ_MAX_KEY_LEN); + proto_spec->set_adminstate(aga_gpu_admin_state_to_proto(spec->admin_state)); + proto_spec->set_overdrivelevel(spec->overdrive_level); + proto_spec->set_gpupowercap(spec->gpu_power_cap); + proto_spec->set_performancelevel(aga_gpu_perf_level_to_proto( + spec->perf_level)); + for (uint32_t i = 0; i < spec->num_clock_freqs; i++) { + aga_gpu_clock_spec_to_proto(proto_spec->add_clockfrequency(), + &spec->clock_freq[i]); + } + proto_spec->set_fanspeed(spec->fan_speed); + proto_spec->set_computepartitiontype( + aga_gpu_compute_partition_type_to_proto( + spec->compute_partition_type)); + proto_spec->set_memorypartitiontype( + aga_gpu_memory_partition_type_to_proto( + spec->memory_partition_type)); + // TODO: fill gpu RAS spec +} + +static inline void +aga_gpu_fw_version_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + for (uint32_t i = 0; i < status->num_fw_versions; i++) { + auto fw_ver = proto_status->add_firmwareversion(); + fw_ver->set_firmware(status->fw_version[i].firmware); + fw_ver->set_version(status->fw_version[i].version); + } +} + +static inline void +aga_gpu_clock_status_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + for (uint32_t i = 0; i < status->num_clock_status; i++) { + auto clk_status = proto_status->add_clockstatus(); + clk_status->set_type(aga_gpu_clock_type_to_proto( + status->clock_status[i].clock_type)); + clk_status->set_frequency(status->clock_status[i].frequency); + clk_status->set_lowfrequency(status->clock_status[i].low_frequency); + 
clk_status->set_highfrequency(status->clock_status[i].high_frequency); + clk_status->set_locked(status->clock_status[i].locked); + clk_status->set_deepsleep(status->clock_status[i].deep_sleep); + } +} +static inline void +aga_gpu_voltage_curve_point_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + // voltage-curve-point proto message is currently not defined in status +} + +static inline amdgpu::GPUOperStatus +aga_gpu_oper_status_to_proto (aga_gpu_oper_state_t oper_status) +{ + switch (oper_status) { + case AGA_GPU_OPER_STATE_UP: + return amdgpu::GPU_OPER_STATUS_UP; + case AGA_GPU_OPER_STATE_DOWN: + return amdgpu::GPU_OPER_STATUS_DOWN; + case AGA_GPU_OPER_STATE_NONE: + default: + break; + } + return amdgpu::GPU_OPER_STATUS_NONE; +} + +static inline amdgpu::GPUXGMIErrorStatus +aga_gpu_xgmi_error_status_to_proto (aga_gpu_xgmi_error_status_t xgmi_status) +{ + switch (xgmi_status) { + case AGA_GPU_XGMI_STATUS_NO_ERROR: + return amdgpu::GPU_XGMI_STATUS_NO_ERROR; + case AGA_GPU_XGMI_STATUS_ONE_ERROR: + return amdgpu::GPU_XGMI_STATUS_ONE_ERROR; + case AGA_GPU_XGMI_STATUS_MULTIPLE_ERROR: + return amdgpu::GPU_XGMI_STATUS_MULTIPLE_ERROR; + default: + break; + } + return amdgpu::GPU_XGMI_STATUS_NONE; +} + +static inline void +aga_gpu_xgmi_status_to_proto (amdgpu::GPUXGMIStatus *proto_status, + const aga_gpu_xgmi_status_t *status) +{ + proto_status->set_errorstatus( + aga_gpu_xgmi_error_status_to_proto(status->error_status)); + proto_status->set_width(status->width); + proto_status->set_speed(status->speed); +} + +static inline amdgpu::PCIeSlotType +aga_gpu_pcie_slot_type_to_proto (aga_pcie_slot_type_t slot_type) +{ + switch (slot_type) { + case AGA_PCIE_SLOT_TYPE_PCIE: + return amdgpu::PCIE_SLOT_TYPE_PCIE; + case AGA_PCIE_SLOT_TYPE_OAM: + return amdgpu::PCIE_SLOT_TYPE_OAM; + case AGA_PCIE_SLOT_TYPE_CEM: + return amdgpu::PCIE_SLOT_TYPE_CEM; + case AGA_PCIE_SLOT_TYPE_UNKNOWN: + return amdgpu::PCIE_SLOT_TYPE_UNKNOWN; + default: + return amdgpu::PCIE_SLOT_TYPE_NONE; + } +} + +// populte PCIe status proto +static inline void +aga_gpu_pcie_status_to_proto (GPUPCIeStatus *proto_status, + const aga_gpu_pcie_status_t *status) +{ + proto_status->set_slottype( + aga_gpu_pcie_slot_type_to_proto(status->slot_type)); + proto_status->set_pciebusid(status->pcie_bus_id); + proto_status->set_maxwidth(status->max_width); + proto_status->set_maxspeed(status->max_speed); + proto_status->set_version(status->version); + proto_status->set_width(status->width); + proto_status->set_speed(status->speed); + proto_status->set_bandwidth(status->bandwidth); +} + +static inline amdgpu::VRAMType +aga_gpu_vram_type_to_proto (aga_vram_type_t type) +{ + switch (type) { + case AGA_VRAM_TYPE_HBM: + return amdgpu::VRAM_TYPE_HBM; + case AGA_VRAM_TYPE_HBM2: + return amdgpu::VRAM_TYPE_HBM2; + case AGA_VRAM_TYPE_HBM2E: + return amdgpu::VRAM_TYPE_HBM2E; + case AGA_VRAM_TYPE_HBM3: + return amdgpu::VRAM_TYPE_HBM3; + case AGA_VRAM_TYPE_DDR2: + return amdgpu::VRAM_TYPE_DDR2; + case AGA_VRAM_TYPE_DDR3: + return amdgpu::VRAM_TYPE_DDR3; + case AGA_VRAM_TYPE_DDR4: + return amdgpu::VRAM_TYPE_DDR4; + case AGA_VRAM_TYPE_GDDR1: + return amdgpu::VRAM_TYPE_GDDR1; + case AGA_VRAM_TYPE_GDDR2: + return amdgpu::VRAM_TYPE_GDDR2; + case AGA_VRAM_TYPE_GDDR3: + return amdgpu::VRAM_TYPE_GDDR3; + case AGA_VRAM_TYPE_GDDR4: + return amdgpu::VRAM_TYPE_GDDR4; + case AGA_VRAM_TYPE_GDDR5: + return amdgpu::VRAM_TYPE_GDDR5; + case AGA_VRAM_TYPE_GDDR6: + return amdgpu::VRAM_TYPE_GDDR6; + case AGA_VRAM_TYPE_GDDR7: + return amdgpu::VRAM_TYPE_GDDR7; + 
case AGA_VRAM_TYPE_UNKNOWN: + return amdgpu::VRAM_TYPE_UNKNOWN; + default: + return amdgpu::VRAM_TYPE_NONE; + } +} + + +// populate VRAM status proto +static inline void +aga_gpu_vram_status_to_proto (GPUVRAMStatus *proto_status, + const aga_gpu_vram_status_t *status) +{ + proto_status->set_type(aga_gpu_vram_type_to_proto(status->type)); + proto_status->set_vendor(status->vendor); + proto_status->set_size(status->size); +} + +static inline amdgpu::GPUPageStatus +aga_gpu_page_status_to_proto (aga_gpu_page_status_t page_status) +{ + switch (page_status) { + case AGA_GPU_PAGE_STATUS_RESERVED: + return amdgpu::GPU_PAGE_STATUS_RESERVED; + case AGA_GPU_PAGE_STATUS_PENDING: + return amdgpu::GPU_PAGE_STATUS_PENDING; + case AGA_GPU_PAGE_STATUS_UNRESERVABLE: + return amdgpu::GPU_PAGE_STATUS_UNRESERVABLE; + case AGA_GPU_PAGE_STATUS_NONE: + default: + break; + } + return amdgpu::GPU_PAGE_STATUS_NONE; +} + +// populate proto buf status from gpu status +static inline void +aga_gpu_api_status_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + proto_status->set_index(status->index); + proto_status->set_gpuhandle((uint64_t)status->handle); + proto_status->set_serialnum(status->serial_num); + proto_status->set_cardseries(status->card_series); + proto_status->set_cardmodel(status->card_model); + proto_status->set_cardvendor(status->card_vendor); + proto_status->set_cardsku(status->card_sku); + proto_status->set_driverversion(status->driver_version); + proto_status->set_vbiosversion(status->vbios_version); + proto_status->set_vbiospartnumber(status->vbios_part_number); + aga_gpu_fw_version_to_proto(proto_status, status); + proto_status->set_memoryvendor(status->memory_vendor); + proto_status->set_operstatus( + aga_gpu_oper_status_to_proto(status->oper_status)); + aga_gpu_clock_status_to_proto(proto_status, status); + for (uint32_t i = 0; i < status->num_kfd_process_id; i++) { + if (status->kfd_process_id[i]) { + // copy only non-zero process ids only + proto_status->add_kfdprocessid(status->kfd_process_id[i]); + } + } + // TODO: fill RAS status + aga_gpu_xgmi_status_to_proto(proto_status->mutable_xgmistatus(), + &status->xgmi_status); + aga_gpu_voltage_curve_point_to_proto(proto_status, status); + aga_gpu_vram_status_to_proto(proto_status->mutable_vramstatus(), + &status->vram_status); + aga_gpu_pcie_status_to_proto(proto_status->mutable_pciestatus(), + &status->pcie_status); + proto_status->set_throttlingstatus(aga_gpu_throttling_status_to_proto( + status->throttling_status)); + proto_status->set_fwtimestamp(status->fw_timestamp); + proto_status->set_partitionid(status->partition_id); + proto_status->set_virtualizationmode(aga_gpu_virtualization_mode_to_proto( + status->virtualization_mode)); + for (uint32_t i = 0; i < status->num_gpu_partition; i++) { + if (status->gpu_partition[i].valid()) { + proto_status->add_gpupartition(status->gpu_partition[i].id, + OBJ_MAX_KEY_LEN); + } + } + if (status->physical_gpu.valid()) { + proto_status->set_physicalgpu(status->physical_gpu.id, OBJ_MAX_KEY_LEN); + } + proto_status->set_kfdid(status->kfd_id); + proto_status->set_nodeid(status->node_id); + proto_status->set_drmrenderid(status->drm_render_id); + proto_status->set_drmcardid(status->drm_card_id); +} + +// populate gpu bad page records proto buf +static inline void +aga_gpu_bad_page_api_info_to_proto (uint32_t num_bad_pages, + aga_gpu_bad_page_record_t *records, + void *ctxt) +{ + streaming_get_ctxt_t *get_ctxt; + GPUBadPageGetResponse *proto_rsp; + grpc::ServerWriter *writer; + + get_ctxt = 
(streaming_get_ctxt_t *)ctxt; + proto_rsp = (GPUBadPageGetResponse *)get_ctxt->msg_ctxt; + writer = (grpc::ServerWriter *)get_ctxt->writer_ctxt; + + for (uint32_t i = 0; i < num_bad_pages; i++) { + get_ctxt->count++; + auto proto_record = proto_rsp->add_record(); + proto_record->set_gpu(records[i].key.id, OBJ_MAX_KEY_LEN); + proto_record->set_pageaddress(records[i].page_address); + proto_record->set_pagesize(records[i].page_size); + proto_record->set_pagestatus( + aga_gpu_page_status_to_proto(records[i].page_status)); + if (proto_rsp->record_size() == AGA_MAX_STREAMING_RSP_SIZE) { + proto_rsp->set_apistatus(sdk_ret_to_api_status(SDK_RET_OK)); + proto_rsp->set_errorcode(sdk_ret_to_error_code(SDK_RET_OK)); + if (!writer->Write(*proto_rsp)) { + AGA_TRACE_ERR("Failed to write gpu bad page info to gRPC " + "stream"); + } + proto_rsp->Clear(); + } + } +} + +// populate gpu compute partition get response proto buf +static inline void +aga_gpu_compute_partition_info_to_proto ( + aga_gpu_compute_partition_info_t *info, void *ctxt) +{ + GPUComputePartitionGetResponse *proto_rsp = + (GPUComputePartitionGetResponse *)ctxt; + + auto resp = proto_rsp->add_response(); + resp->set_id(info->physical_gpu.id, OBJ_MAX_KEY_LEN); + resp->set_partitiontype(aga_gpu_compute_partition_type_to_proto( + info->partition_type)); + for (uint32_t i = 0; i < info->num_gpu_partition; i++) { + if (info->gpu_partition[i].valid()) { + resp->add_gpupartition(info->gpu_partition[i].id, + OBJ_MAX_KEY_LEN); + } + } +} + +// populate gpu memory partition get response proto buf +static inline void +aga_gpu_memory_partition_info_to_proto ( + aga_gpu_memory_partition_info_t *info, void *ctxt) +{ + GPUMemoryPartitionGetResponse *proto_rsp = + (GPUMemoryPartitionGetResponse *)ctxt; + + auto resp = proto_rsp->add_response(); + resp->set_id(info->physical_gpu.id, OBJ_MAX_KEY_LEN); + resp->set_partitiontype(aga_gpu_memory_partition_type_to_proto( + info->partition_type)); +} + +// populate temperature proto buf stats from gpu stats +static inline void +aga_gpu_temp_stats_to_proto (amdgpu::GPUTemperatureStats *proto_stats, + const aga_gpu_temperature_stats_t *stats) +{ + proto_stats->add_hbmtemperature(stats->hbm_temperature[0]); + proto_stats->add_hbmtemperature(stats->hbm_temperature[1]); + proto_stats->add_hbmtemperature(stats->hbm_temperature[2]); + proto_stats->add_hbmtemperature(stats->hbm_temperature[3]); + proto_stats->set_edgetemperature(stats->edge_temperature); + proto_stats->set_junctiontemperature(stats->junction_temperature); + proto_stats->set_memorytemperature(stats->memory_temperature); +} + +// populate proto gpu usage stats from gpu stats +static inline void +aga_gpu_usage_stats_to_proto (GPUUsage *proto_stats, + const aga_gpu_usage_t *stats) +{ + proto_stats->set_gfxactivity(stats->gfx_activity); + proto_stats->set_umcactivity(stats->umc_activity); + proto_stats->set_mmactivity(stats->mm_activity); + for (uint16_t i = 0; i < AGA_GPU_MAX_VCN; i++) { + proto_stats->add_vcnactivity(stats->vcn_activity[i]); + proto_stats->add_vcnbusyinst(stats->vcn_busy[i]); + } + for (uint16_t i = 0; i < AGA_GPU_MAX_JPEG; i++) { + proto_stats->add_jpegactivity(stats->jpeg_activity[i]); + } + for (uint16_t i = 0; i < AGA_GPU_MAX_JPEG_ENG; i++) { + proto_stats->add_jpegbusyinst(stats->jpeg_busy[i]); + } + for (uint16_t i = 0; i < AGA_GPU_MAX_XCC; i++) { + proto_stats->add_gfxbusyinst(stats->gfx_busy_inst[i]); + } +} + +// populate proto memory usage stats from gpu stats +static inline void +aga_gpu_memory_usage_stats_to_proto (GPUMemoryUsage 
*proto_stats, + const aga_gpu_memory_usage_t *stats) +{ + proto_stats->set_memoryusage(stats->memory_usage); + proto_stats->set_activity(stats->activity); +} + +// populte PCIe stats proto +static inline void +aga_gpu_pcie_stats_to_proto (GPUPCIeStats *proto_stats, + const aga_gpu_pcie_stats_t *stats) +{ + proto_stats->set_replaycount(stats->replay_count); + proto_stats->set_recoverycount(stats->recovery_count); + proto_stats->set_replayrollovercount(stats->replay_rollover_count); + proto_stats->set_nacksentcount(stats->nack_sent_count); + proto_stats->set_nackreceivedcount(stats->nack_received_count); + proto_stats->set_rxbytes(stats->rx_bytes); + proto_stats->set_txbytes(stats->tx_bytes); + proto_stats->set_bidirbandwidth(stats->bidir_bandwidth); +} + +// populte VRAM usage stats proto +static inline void +aga_gpu_vram_usage_stats_to_proto (GPUVRAMUsage *proto_stats, + const aga_gpu_vram_usage_t *stats) +{ + proto_stats->set_totalvram(stats->total_vram); + proto_stats->set_usedvram(stats->used_vram); + proto_stats->set_freevram(stats->free_vram); + proto_stats->set_totalvisiblevram(stats->total_visible_vram); + proto_stats->set_usedvisiblevram(stats->used_visible_vram); + proto_stats->set_freevisiblevram(stats->free_visible_vram); + proto_stats->set_totalgtt(stats->total_gtt); + proto_stats->set_usedgtt(stats->used_gtt); + proto_stats->set_freegtt(stats->free_gtt); +} + +// populte GPU voltage proto +static inline void +aga_gpu_voltage_to_proto (GPUVoltage *proto_stats, + const aga_gpu_voltage_t *stats) +{ + proto_stats->set_voltage(stats->voltage); + proto_stats->set_gfxvoltage(stats->gfx_voltage); + proto_stats->set_memoryvoltage(stats->memory_voltage); +} + +// populate GPU XGMI link statistics +static inline void +aga_gpu_xgmi_link_stats_to_proto (amdgpu::GPUXGMILinkStats *proto_stats, + const aga_gpu_xgmi_link_stats_t *stats) +{ + proto_stats->set_dataread(stats->data_read); + proto_stats->set_datawrite(stats->data_write); +} + +// populate GPU violation statistics +static inline void +aga_gpu_violation_stats_to_proto (amdgpu::GPUViolationStats *proto_stats, + const aga_gpu_violation_stats_t *stats) +{ + proto_stats->set_currentaccumulatedcounter( + stats->current_accumulated_counter); + proto_stats->set_processorhotresidencyaccumulated( + stats->processor_hot_residency_accumulated); + proto_stats->set_pptresidencyaccumulated( + stats->ppt_residency_accumulated); + proto_stats->set_socketthermalresidencyaccumulated( + stats->socket_thermal_residency_accumulated); + proto_stats->set_vrthermalresidencyaccumulated( + stats->vr_thermal_residency_accumulated); + proto_stats->set_hbmthermalresidencyaccumulated( + stats->hbm_thermal_residency_accumulated); +} + +// populate proto buf stats from gpu stats +static inline void +aga_gpu_api_stats_to_proto (GPUStats *proto_stats, + const aga_gpu_stats_t *stats) +{ + proto_stats->set_packagepower(stats->package_power); + proto_stats->set_avgpackagepower(stats->avg_package_power); + aga_gpu_temp_stats_to_proto(proto_stats->mutable_temperature(), + &stats->temperature); + aga_gpu_usage_stats_to_proto(proto_stats->mutable_usage(), + &stats->usage); + aga_gpu_voltage_to_proto(proto_stats->mutable_voltage(), + &stats->voltage); + aga_gpu_pcie_stats_to_proto(proto_stats->mutable_pciestats(), + &stats->pcie_stats); + aga_gpu_vram_usage_stats_to_proto(proto_stats->mutable_vramusage(), + &stats->vram_usage); + proto_stats->set_energyconsumed(stats->energy_consumed); + proto_stats->set_powerusage(stats->power_usage); + 
proto_stats->set_totalcorrectableerrors(stats->total_correctable_errors); + proto_stats->set_totaluncorrectableerrors( + stats->total_uncorrectable_errors); + proto_stats->set_sdmacorrectableerrors(stats->sdma_correctable_errors); + proto_stats->set_sdmauncorrectableerrors(stats->sdma_uncorrectable_errors); + proto_stats->set_gfxcorrectableerrors(stats->gfx_correctable_errors); + proto_stats->set_gfxuncorrectableerrors(stats->gfx_uncorrectable_errors); + proto_stats->set_mmhubcorrectableerrors(stats->mmhub_correctable_errors); + proto_stats->set_mmhubuncorrectableerrors( + stats->mmhub_uncorrectable_errors); + proto_stats->set_athubcorrectableerrors(stats->athub_correctable_errors); + proto_stats->set_athubuncorrectableerrors( + stats->athub_uncorrectable_errors); + proto_stats->set_bifcorrectableerrors(stats->bif_correctable_errors); + proto_stats->set_bifuncorrectableerrors(stats->bif_uncorrectable_errors); + proto_stats->set_hdpcorrectableerrors(stats->hdp_correctable_errors); + proto_stats->set_hdpuncorrectableerrors(stats->hdp_uncorrectable_errors); + proto_stats->set_xgmiwaflcorrectableerrors( + stats->xgmi_wafl_correctable_errors); + proto_stats->set_xgmiwafluncorrectableerrors( + stats->xgmi_wafl_uncorrectable_errors); + proto_stats->set_dfcorrectableerrors(stats->df_correctable_errors); + proto_stats->set_dfuncorrectableerrors(stats->df_uncorrectable_errors); + proto_stats->set_smncorrectableerrors(stats->smn_correctable_errors); + proto_stats->set_smnuncorrectableerrors(stats->smn_uncorrectable_errors); + proto_stats->set_semcorrectableerrors(stats->sem_correctable_errors); + proto_stats->set_semuncorrectableerrors(stats->sem_uncorrectable_errors); + proto_stats->set_mp0correctableerrors(stats->mp0_correctable_errors); + proto_stats->set_mp0uncorrectableerrors(stats->mp0_uncorrectable_errors); + proto_stats->set_mp1correctableerrors(stats->mp1_correctable_errors); + proto_stats->set_mp1uncorrectableerrors(stats->mp1_uncorrectable_errors); + proto_stats->set_fusecorrectableerrors(stats->fuse_correctable_errors); + proto_stats->set_fuseuncorrectableerrors(stats->fuse_uncorrectable_errors); + proto_stats->set_umccorrectableerrors(stats->umc_correctable_errors); + proto_stats->set_umcuncorrectableerrors(stats->umc_uncorrectable_errors); + proto_stats->set_mcacorrectableerrors(stats->mca_correctable_errors); + proto_stats->set_mcauncorrectableerrors(stats->mca_uncorrectable_errors); + proto_stats->set_vcncorrectableerrors(stats->vcn_correctable_errors); + proto_stats->set_vcnuncorrectableerrors(stats->vcn_uncorrectable_errors); + proto_stats->set_jpegcorrectableerrors(stats->jpeg_correctable_errors); + proto_stats->set_jpeguncorrectableerrors(stats->jpeg_uncorrectable_errors); + proto_stats->set_ihcorrectableerrors(stats->ih_correctable_errors); + proto_stats->set_ihuncorrectableerrors(stats->ih_uncorrectable_errors); + proto_stats->set_mpiocorrectableerrors(stats->mpio_correctable_errors); + proto_stats->set_mpiouncorrectableerrors(stats->mpio_uncorrectable_errors); + proto_stats->set_xgmineighbor0txnops(stats->xgmi_neighbor0_tx_nops); + proto_stats->set_xgmineighbor0txrequests(stats->xgmi_neighbor0_tx_requests); + proto_stats->set_xgmineighbor0txresponses + (stats->xgmi_neighbor0_tx_responses); + proto_stats->set_xgmineighbor0txbeats(stats->xgmi_neighbor0_tx_beats); + proto_stats->set_xgmineighbor1txnops(stats->xgmi_neighbor1_tx_nops); + proto_stats->set_xgmineighbor1txrequests(stats->xgmi_neighbor1_tx_requests); + proto_stats->set_xgmineighbor1txresponses + 
(stats->xgmi_neighbor1_tx_responses); + proto_stats->set_xgmineighbor1txbeats(stats->xgmi_neighbor1_tx_beats); + proto_stats->set_xgmineighbor0txthroughput( + stats->xgmi_neighbor0_tx_throughput); + proto_stats->set_xgmineighbor1txthroughput( + stats->xgmi_neighbor1_tx_throughput); + proto_stats->set_xgmineighbor2txthroughput( + stats->xgmi_neighbor2_tx_throughput); + proto_stats->set_xgmineighbor3txthroughput( + stats->xgmi_neighbor3_tx_throughput); + proto_stats->set_xgmineighbor4txthroughput( + stats->xgmi_neighbor4_tx_throughput); + proto_stats->set_xgmineighbor5txthroughput( + stats->xgmi_neighbor5_tx_throughput); + proto_stats->set_fanspeed(stats->fan_speed); + proto_stats->set_gfxactivityaccumulated(stats->gfx_activity_accumulated); + proto_stats->set_memoryactivityaccumulated(stats->mem_activity_accumulated); + for (uint32_t i = 0; i < AGA_GPU_MAX_XGMI_LINKS; i++) { + aga_gpu_xgmi_link_stats_to_proto(proto_stats->add_xgmilinkstats(), + &stats->xgmi_link_stats[i]); + } + aga_gpu_violation_stats_to_proto(proto_stats->mutable_violationstats(), + &stats->violation_stats); +} + +// populate proto buf from gpu info +static inline void +aga_gpu_api_info_to_proto (aga_gpu_info_t *info, void *ctxt) +{ + GPUGetResponse *proto_rsp = (GPUGetResponse *)ctxt; + auto gpu = proto_rsp->add_response(); + GPUSpec *proto_spec = gpu->mutable_spec(); + GPUStatus *proto_status = gpu->mutable_status(); + GPUStats *proto_stats = gpu->mutable_stats(); + + aga_gpu_api_spec_to_proto(proto_spec, &info->spec); + aga_gpu_api_status_to_proto(proto_status, &info->status); + aga_gpu_api_stats_to_proto(proto_stats, &info->stats); +} + +// convert aga cper severity to proto +static inline amdgpu::CPERSeverity +aga_cper_severity_to_proto (aga_cper_severity_t severity) +{ + switch (severity) { + case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED: + return amdgpu::CPER_SEVERITY_NON_FATAL_UNCORRECTED; + break; + case AGA_CPER_SEVERITY_FATAL: + return amdgpu::CPER_SEVERITY_FATAL; + break; + case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED: + return amdgpu::CPER_SEVERITY_NON_FATAL_CORRECTED; + break; + default: + break; + } + return amdgpu::CPER_SEVERITY_NONE; +} + +// convert aga cper notification type to proto +static inline amdgpu::CPERNotificationType +aga_cper_notification_type_to_proto (aga_cper_notification_type_t ntfn_type) +{ + switch (ntfn_type) { + case AGA_CPER_NOTIFICATION_TYPE_CMC: + return amdgpu::CPER_NOTIFICATION_TYPE_CMC; + break; + case AGA_CPER_NOTIFICATION_TYPE_CPE: + return amdgpu::CPER_NOTIFICATION_TYPE_CPE; + break; + case AGA_CPER_NOTIFICATION_TYPE_MCE: + return amdgpu::CPER_NOTIFICATION_TYPE_MCE; + break; + case AGA_CPER_NOTIFICATION_TYPE_PCIE: + return amdgpu::CPER_NOTIFICATION_TYPE_PCIE; + break; + case AGA_CPER_NOTIFICATION_TYPE_INIT: + return amdgpu::CPER_NOTIFICATION_TYPE_INIT; + break; + case AGA_CPER_NOTIFICATION_TYPE_NMI: + return amdgpu::CPER_NOTIFICATION_TYPE_NMI; + break; + case AGA_CPER_NOTIFICATION_TYPE_BOOT: + return amdgpu::CPER_NOTIFICATION_TYPE_BOOT; + break; + case AGA_CPER_NOTIFICATION_TYPE_DMAR: + return amdgpu::CPER_NOTIFICATION_TYPE_DMAR; + break; + case AGA_CPER_NOTIFICATION_TYPE_SEA: + return amdgpu::CPER_NOTIFICATION_TYPE_SEA; + break; + case AGA_CPER_NOTIFICATION_TYPE_SEI: + return amdgpu::CPER_NOTIFICATION_TYPE_SEI; + break; + case AGA_CPER_NOTIFICATION_TYPE_PEI: + return amdgpu::CPER_NOTIFICATION_TYPE_PEI; + break; + case AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT: + return amdgpu::CPER_NOTIFICATION_TYPE_CXL_COMPONENT; + break; + default: + break; + } + return 
amdgpu::CPER_NOTIFICATION_TYPE_NONE; +} + +// populate gpu cper information proto buf +static inline void +aga_gpu_cper_api_info_to_proto (aga_cper_info_t *info, + void *ctxt) +{ + GPUCPEREntry *cper; + GPUCPERGetResponse *proto_rsp = (GPUCPERGetResponse *)ctxt; + + if (!info->num_cper_entry) { + return; + } + cper = proto_rsp->add_cper(); + cper->set_gpu(info->gpu.id, OBJ_MAX_KEY_LEN); + for (uint32_t i = 0; i < info->num_cper_entry; i++) { + auto cper_entry = cper->add_cperentry(); + cper_entry->set_recordid(info->cper_entry[i].record_id); + cper_entry->set_severity( + aga_cper_severity_to_proto(info->cper_entry[i].severity)); + cper_entry->set_revision(info->cper_entry[i].revision); + cper_entry->set_timestamp(info->cper_entry[i].timestamp); + cper_entry->set_creatorid(info->cper_entry[i].creator_id); + cper_entry->set_notificationtype( + aga_cper_notification_type_to_proto( + info->cper_entry[i].notification_type)); + for (uint32_t j = 0; j < info->cper_entry[i].num_af_id; j++) { + cper_entry->add_afid(info->cper_entry[i].af_id[j]); + } + } +} + +#endif // __AGA_SVC_GPU_TO_PROTO_HPP__
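Note on consuming the new violation-stats fields: printGPUStats above only prints the accumulated violation counters, while this change also adds percentage fields and per-XCC repeated fields (GFXBelowHostLimit*, GFXLowUtilization*) to GPUViolationStats. The sketch below is illustrative only and is not part of this change; it assumes the protobuf-generated Go getters follow the proto field names (for example GetPPTResidencyPercentage and GetGFXBelowHostLimitPowerPercentage) and reuses the CLI's UINT64_MAX_VAL "not supported" sentinel convention. A local stand-in struct is used so the example compiles on its own; in the real CLI the values would come from the generated aga.GPUViolationStats message.

// Illustrative sketch (not part of this change): printing the new
// GPUViolationStats percentage and per-XCC fields in the style of
// printGPUStats. violationStats is a local stand-in for the generated
// aga.GPUViolationStats message and its assumed getters.
package main

import "fmt"

// UINT64_MAX_VAL mirrors the "field not supported" sentinel used by the CLI
const UINT64_MAX_VAL = ^uint64(0)

type violationStats struct {
	PPTResidencyPercentage           uint64   // whole-GPU percentage
	GFXBelowHostLimitPowerPercentage []uint64 // one entry per XCC
}

func printViolationPercentages(indent string, v *violationStats) {
	// scalar percentages print one per line, like the accumulated counters
	if v.PPTResidencyPercentage != UINT64_MAX_VAL &&
		v.PPTResidencyPercentage <= 100 {
		fmt.Printf(indent+"%-38s : %d\n", "PPT residency (in %)",
			v.PPTResidencyPercentage)
	}
	// per-XCC values go on a single line, printing N/A for invalid
	// entries, mirroring how VCN/JPEG activity is handled above
	str := fmt.Sprintf(indent+"%-38s : ", "GFX below host limit power (in %)")
	validEntry := false
	for _, pct := range v.GFXBelowHostLimitPowerPercentage {
		if pct == UINT64_MAX_VAL || pct > 100 {
			str = fmt.Sprintf("%sN/A ", str)
		} else {
			validEntry = true
			str = fmt.Sprintf("%s%d%% ", str, pct)
		}
	}
	if validEntry {
		fmt.Printf("%s\n", str)
	}
}

func main() {
	// example with four XCCs reporting and one unsupported entry
	printViolationPercentages("  ", &violationStats{
		PPTResidencyPercentage:           3,
		GFXBelowHostLimitPowerPercentage: []uint64{1, 0, 2, 5, UINT64_MAX_VAL},
	})
}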