diff --git a/sw/nic/gpuagent/api/include/aga_gpu.hpp b/sw/nic/gpuagent/api/include/aga_gpu.hpp
index 3af1b34..187359b 100644
--- a/sw/nic/gpuagent/api/include/aga_gpu.hpp
+++ b/sw/nic/gpuagent/api/include/aga_gpu.hpp
@@ -576,6 +576,32 @@ typedef struct aga_gpu_violation_stats_s {
     uint64_t vr_thermal_residency_accumulated;
     /// High Bandwidth Memory (HBM) thermal residency accumulated
     uint64_t hbm_thermal_residency_accumulated;
+    /// processor hot residency percentage
+    uint64_t processor_hot_residency_percentage;
+    /// Package Power Tracking (PPT) residency percentage
+    uint64_t ppt_residency_percentage;
+    /// socket thermal residency percentage
+    uint64_t socket_thermal_residency_percentage;
+    /// Voltage Rail (VR) thermal residency percentage
+    uint64_t vr_thermal_residency_percentage;
+    /// High Bandwidth Memory (HBM) thermal residency percentage
+    uint64_t hbm_thermal_residency_percentage;
+    /// gfx clock below host limit power accumulated per XCC
+    uint64_t gfx_clk_below_host_limit_power_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit thermal accumulated per XCC
+    uint64_t gfx_clk_below_host_limit_thermal_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx low utilization accumulated per XCC
+    uint64_t gfx_low_utilization_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit total accumulated per XCC
+    uint64_t gfx_clk_below_host_limit_total_accumulated[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit power percentage per XCC
+    uint64_t gfx_clk_below_host_limit_power_percentage[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit thermal percentage per XCC
+    uint64_t gfx_clk_below_host_limit_thermal_percentage[AGA_GPU_MAX_XCC];
+    /// gfx low utilization percentage per XCC
+    uint64_t gfx_low_utilization_percentage[AGA_GPU_MAX_XCC];
+    /// gfx clock below host limit total percentage per XCC
+    uint64_t gfx_clk_below_host_limit_total_percentage[AGA_GPU_MAX_XCC];
 } aga_gpu_violation_stats_t;
 
 /// \brief GPU statistics
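The percentage counterparts of the existing accumulated counters, plus the new per-XCC arrays, are plain uint64_t fields, so they can be consumed directly from aga_gpu_violation_stats_t once this header change lands. Below is a minimal consumer-side sketch, assuming the patched header resolves through the same include path style used elsewhere in the tree; the function itself is illustrative and not part of this change.

#include <cinttypes>
#include <cstdio>
#include "nic/gpuagent/api/include/aga_gpu.hpp"

// Sketch: print the new per-XCC "gfx clock below host limit" percentages.
// AGA_GPU_MAX_XCC and the field names come from the hunk above; everything
// else here is illustrative.
static void
dump_gfx_below_host_limit (const aga_gpu_violation_stats_t *stats)
{
    for (uint32_t i = 0; i < AGA_GPU_MAX_XCC; i++) {
        printf("XCC %u: power %" PRIu64 "%%, thermal %" PRIu64 "%%, "
               "total %" PRIu64 "%%\n", i,
               stats->gfx_clk_below_host_limit_power_percentage[i],
               stats->gfx_clk_below_host_limit_thermal_percentage[i],
               stats->gfx_clk_below_host_limit_total_percentage[i]);
    }
}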
diff --git a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
index 3e0b180..d6fbb1a 100644
--- a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
+++ b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
@@ -940,6 +940,108 @@ smi_gpu_get_bad_page_records (void *gpu_obj,
     return SDK_RET_OK;
 }
 
+static sdk_ret_t
+smi_fill_violation_stats_ (aga_gpu_handle_t gpu_handle,
+                           uint32_t partition_id,
+                           amdsmi_gpu_metrics_t *metrics_info,
+                           aga_gpu_violation_stats_t *stats)
+{
+    amdsmi_status_t amdsmi_ret;
+    amdsmi_violation_status_t violation_status = {};
+
+    // initialize stats to invalid values
+    memset(stats, 0xff, sizeof(aga_gpu_violation_stats_t));
+
+    amdsmi_ret = amdsmi_get_violation_status(gpu_handle, &violation_status);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get violation status for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+        if (!partition_id) {
+            // fill non-partition stats from metrics info only
+            // for primary partition
+            stats->current_accumulated_counter =
+                metrics_info->accumulation_counter;
+            stats->processor_hot_residency_accumulated =
+                metrics_info->prochot_residency_acc;
+            stats->ppt_residency_accumulated =
+                metrics_info->ppt_residency_acc;
+            stats->socket_thermal_residency_accumulated =
+                metrics_info->socket_thm_residency_acc;
+            stats->vr_thermal_residency_accumulated =
+                metrics_info->vr_thm_residency_acc;
+            stats->hbm_thermal_residency_accumulated =
+                metrics_info->hbm_thm_residency_acc;
+        }
+        for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) {
+            stats->gfx_clk_below_host_limit_power_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_below_host_limit_ppt_acc[i];
+            stats->gfx_clk_below_host_limit_thermal_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_below_host_limit_thm_acc[i];
+            stats->gfx_low_utilization_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_low_utilization_acc[i];
+            stats->gfx_clk_below_host_limit_total_accumulated[i] =
+                metrics_info->xcp_stats[
+                    partition_id].gfx_below_host_limit_total_acc[i];
+        }
+    } else {
+        if (!partition_id) {
+            // fill non-partition stats from violation status only
+            // for primary partition
+            stats->current_accumulated_counter =
+                violation_status.acc_counter;
+            stats->processor_hot_residency_accumulated =
+                violation_status.acc_prochot_thrm;
+            stats->ppt_residency_accumulated =
+                violation_status.acc_ppt_pwr;
+            stats->socket_thermal_residency_accumulated =
+                violation_status.acc_socket_thrm;
+            stats->vr_thermal_residency_accumulated =
+                violation_status.acc_vr_thrm;
+            stats->hbm_thermal_residency_accumulated =
+                violation_status.acc_hbm_thrm;
+            stats->processor_hot_residency_percentage =
+                violation_status.per_prochot_thrm;
+            stats->ppt_residency_percentage =
+                violation_status.per_ppt_pwr;
+            stats->socket_thermal_residency_percentage =
+                violation_status.per_socket_thrm;
+            stats->vr_thermal_residency_percentage =
+                violation_status.per_vr_thrm;
+            stats->hbm_thermal_residency_percentage =
+                violation_status.per_hbm_thrm;
+        }
+        for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) {
+            stats->gfx_clk_below_host_limit_power_accumulated[i] =
+                violation_status.acc_gfx_clk_below_host_limit_pwr[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_thermal_accumulated[i] =
+                violation_status.acc_gfx_clk_below_host_limit_thm[
+                    partition_id][i];
+            stats->gfx_low_utilization_accumulated[i] =
+                violation_status.acc_low_utilization[partition_id][i];
+            stats->gfx_clk_below_host_limit_total_accumulated[i] =
+                violation_status.acc_gfx_clk_below_host_limit_total[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_power_percentage[i] =
+                violation_status.per_gfx_clk_below_host_limit_pwr[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_thermal_percentage[i] =
+                violation_status.per_gfx_clk_below_host_limit_thm[
+                    partition_id][i];
+            stats->gfx_low_utilization_percentage[i] =
+                violation_status.per_low_utilization[
+                    partition_id][i];
+            stats->gfx_clk_below_host_limit_total_percentage[i] =
+                violation_status.per_gfx_clk_below_host_limit_total[
+                    partition_id][i];
+        }
+    }
+    return SDK_RET_OK;
+}
+
 static sdk_ret_t
 smi_fill_vram_usage_ (aga_gpu_handle_t gpu_handle,
                       aga_gpu_vram_usage_t *usage)
@@ -1016,7 +1118,6 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
     amdsmi_status_t amdsmi_ret;
     uint64_t sent, received, max_pkt_size;
     amdsmi_gpu_metrics_t metrics_info = {};
-    amdsmi_violation_status_t violation_status = {};
 
     // fill VRAM usage
     smi_fill_vram_usage_(gpu_handle, &stats->vram_usage);
@@ -1045,36 +1146,10 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
             metrics_info.xgmi_write_data_acc[i];
     }
     // fill violation statistics
-    amdsmi_ret = amdsmi_get_violation_status(gpu_handle, &violation_status);
-    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
-        AGA_TRACE_ERR("Failed to get violation status for GPU {}, err {}",
-                      gpu_handle, amdsmi_ret);
-        // revert to populating from metrics payload
-        stats->violation_stats.current_accumulated_counter =
-            metrics_info.accumulation_counter;
-        stats->violation_stats.processor_hot_residency_accumulated =
-            metrics_info.prochot_residency_acc;
-        stats->violation_stats.ppt_residency_accumulated =
-            metrics_info.ppt_residency_acc;
-        stats->violation_stats.socket_thermal_residency_accumulated =
-            metrics_info.socket_thm_residency_acc;
-        stats->violation_stats.vr_thermal_residency_accumulated =
-            metrics_info.vr_thm_residency_acc;
-        stats->violation_stats.hbm_thermal_residency_accumulated =
-            metrics_info.hbm_thm_residency_acc;
-    } else {
-        stats->violation_stats.current_accumulated_counter =
-            violation_status.acc_counter;
-        stats->violation_stats.processor_hot_residency_accumulated =
-            violation_status.acc_prochot_thrm;
-        stats->violation_stats.ppt_residency_accumulated =
-            violation_status.acc_ppt_pwr;
-        stats->violation_stats.socket_thermal_residency_accumulated =
-            violation_status.acc_socket_thrm;
-        stats->violation_stats.vr_thermal_residency_accumulated =
-            violation_status.acc_vr_thrm;
-        stats->violation_stats.hbm_thermal_residency_accumulated =
-            violation_status.acc_hbm_thrm;
+    if (!partition_id) {
+        smi_fill_violation_stats_(gpu_handle, partition_id,
+                                  &metrics_info,
+                                  &stats->violation_stats);
     }
     // get usage information from the metrics info for partition 0
     for (uint16_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) {
@@ -1173,6 +1248,10 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
                 metrics_info.xcp_stats[partition_id].gfx_busy_inst[i];
             }
         }
+        // get violation stats from first gpu partition for XCP level data
+        smi_fill_violation_stats_(first_partition_handle, partition_id,
+                                  &metrics_info,
+                                  &stats->violation_stats);
     }
     return SDK_RET_OK;
 }
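Read on their own, the two new call sites in smi_gpu_fill_stats() are easy to conflate: one runs only for the primary partition, the other pulls XCP-level data through the first partition's handle. A condensed sketch of that intent follows; the wrapper function, its signature, and the gpu_is_partitioned guard are invented purely for illustration (in the real function the second call sits inside the existing partition-handling block), and only the smi_fill_violation_stats_() calls mirror the diff.

// Illustrative only: how the two smi_fill_violation_stats_() call sites above
// relate. Names other than smi_fill_violation_stats_() are hypothetical.
static void
fill_violation_stats_sketch (aga_gpu_handle_t gpu_handle,
                             aga_gpu_handle_t first_partition_handle,
                             uint32_t partition_id,
                             bool gpu_is_partitioned,
                             amdsmi_gpu_metrics_t *metrics_info,
                             aga_gpu_stats_t *stats)
{
    if (!partition_id) {
        // primary partition (or an unpartitioned GPU): socket-wide counters,
        // the new percentages and the per-XCC arrays come from the GPU's own
        // handle
        smi_fill_violation_stats_(gpu_handle, partition_id, metrics_info,
                                  &stats->violation_stats);
    }
    if (gpu_is_partitioned) {
        // GPU partitions: XCP-level (per-XCC) data is read through the first
        // partition's handle and indexed by this partition's id
        smi_fill_violation_stats_(first_partition_handle, partition_id,
                                  metrics_info, &stats->violation_stats);
    }
}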
clock_status = &status->clock_status[clk_cnt]; + // min and max frequencies are per clock type + find_low_high_frequency(&freq, + &clock_status->low_frequency, + &clock_status->high_frequency); + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_PCIE; + clock_status->frequency = freq.frequency[freq.current]/1000000; + clock_status->deep_sleep = + (clock_status->frequency < clock_status->low_frequency); + clk_cnt++; + } + status->num_clock_status = clk_cnt; + return SDK_RET_OK; +} + +/// \brief fill PCIe status +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_pcie_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_status_t *status) +{ + amdsmi_pcie_info_t info; + amdsmi_status_t amdsmi_ret; + aga_gpu_pcie_status_t *pcie_status = &status->pcie_status; + + amdsmi_ret = amdsmi_get_pcie_info(gpu_handle, &info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get PCIe info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + pcie_status->slot_type = + smi_to_aga_pcie_slot_type(info.pcie_static.slot_type); + pcie_status->max_width = info.pcie_static.max_pcie_width; + pcie_status->max_speed = info.pcie_static.max_pcie_speed/1000; + pcie_status->version = info.pcie_static.pcie_interface_version; + pcie_status->width = info.pcie_metric.pcie_width; + pcie_status->speed = info.pcie_metric.pcie_speed/1000; + pcie_status->bandwidth = info.pcie_metric.pcie_bandwidth; + } + return SDK_RET_OK; +} + +/// \brief fill VRAM status +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_vram_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_vram_status_t *status) +{ + amdsmi_vram_info_t info; + amdsmi_status_t amdsmi_ret; + + amdsmi_ret = amdsmi_get_gpu_vram_info(gpu_handle, &info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get VRAM info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + status->type = smi_to_aga_vram_type(info.vram_type); + memcpy(status->vendor, info.vram_vendor, AGA_MAX_STR_LEN); + status->size = info.vram_size; + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_partition_info (aga_gpu_handle_t gpu_handle, bool *capable, + aga_gpu_compute_partition_type_t *compute_partition, + aga_gpu_memory_partition_type_t *memory_partition) +{ + amdsmi_status_t amdsmi_ret; + amdsmi_gpu_metrics_t metrics_info = {}; + char partition_type[AGA_MAX_STR_LEN + 1]; + + *capable = true; + *compute_partition = AGA_GPU_COMPUTE_PARTITION_TYPE_NONE; + *memory_partition = AGA_GPU_MEMORY_PARTITION_TYPE_NONE; + // to deduce partition capability of platform, we rely on + // metrics field num_partition of a GPU field to be 0xffff + // on partition supported platform, this api is not supported + // for paritioned GPU other than index 0 or first_handle + // we mark the capablity to true on such cases to specify platform + // partition capability + amdsmi_ret = amdsmi_get_gpu_metrics_info(gpu_handle, + &metrics_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + if ((metrics_info.num_partition & 0xffff) == + AMDSMI_INVALID_PARTITION_COUNT) { + // this is unsupported platform like Mi2xx + *capable = false; + } + } + // fill compute partition type + amdsmi_ret = 
amdsmi_get_gpu_compute_partition(gpu_handle,
+                                     partition_type, AGA_MAX_STR_LEN + 1);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get compute partition for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        *compute_partition =
+            smi_to_aga_gpu_compute_partition_type(partition_type);
+    }
+    // fill memory partition type
+    amdsmi_ret = amdsmi_get_gpu_memory_partition(gpu_handle,
+                     partition_type, AGA_MAX_STR_LEN + 1);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get memory partition for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        *memory_partition =
+            smi_to_aga_gpu_memory_partition_type(partition_type);
+    }
+    return SDK_RET_OK;
+}
+
+sdk_ret_t
+smi_get_gpu_partition_id (aga_gpu_handle_t gpu_handle, uint32_t *partition_id)
+{
+    amdsmi_status_t status;
+    amdsmi_kfd_info_t kfd_info;
+
+    status = amdsmi_get_gpu_kfd_info(gpu_handle, &kfd_info);
+    if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get partition id of GPU {}, err {}",
+                      gpu_handle, status);
+        return amdsmi_ret_to_sdk_ret(status);
+    }
+    *partition_id = kfd_info.current_partition_id;
+    return SDK_RET_OK;
+}
+
+sdk_ret_t
+smi_gpu_fill_status (aga_gpu_handle_t gpu_handle, uint32_t gpu_id,
+                     aga_gpu_spec_t *spec, aga_gpu_status_t *status)
+{
+    amdsmi_status_t amdsmi_ret;
+    amdsmi_xgmi_status_t xgmi_st;
+    amdsmi_od_volt_freq_data_t vc_data;
+    amdsmi_gpu_metrics_t metrics_info = { 0 };
+
+    if (g_gpu_metrics.find(gpu_handle) != g_gpu_metrics.end()) {
+        metrics_info = g_gpu_metrics[gpu_handle];
+        // fill the clock status with metrics info
+        smi_fill_clock_status_(gpu_handle, spec, status, &metrics_info);
+        // fill firmware timestamp
+        status->fw_timestamp = metrics_info.firmware_timestamp;
+        if (metrics_info.throttle_status !=
+            std::numeric_limits<decltype(metrics_info.throttle_status)>::max()) {
+            status->throttling_status =
+                metrics_info.throttle_status ? AGA_GPU_THROTTLING_STATUS_ON :
+                                               AGA_GPU_THROTTLING_STATUS_OFF;
+        }
+        status->xgmi_status.width = metrics_info.xgmi_link_width;
+        status->xgmi_status.speed = metrics_info.xgmi_link_speed;
+    } else {
+        AGA_TRACE_ERR("GPU metrics info not available in cache for GPU {}",
+                      gpu_handle);
+    }
+    // fill the PCIe status
+    smi_fill_pcie_status_(gpu_handle, status);
+    // fill the xgmi error count
+    amdsmi_ret = amdsmi_gpu_xgmi_error_status(gpu_handle, &xgmi_st);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get xgmi error status for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        status->xgmi_status.error_status = smi_to_aga_gpu_xgmi_error(xgmi_st);
+    }
+    // fill the voltage curve points
+    amdsmi_ret = amdsmi_get_gpu_od_volt_info(gpu_handle, &vc_data);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get voltage curve points for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        for (uint32_t i = 0;
+             (i < AGA_GPU_MAX_VOLTAGE_CURVE_POINT) &&
+             (i < AMDSMI_NUM_VOLTAGE_CURVE_POINTS); i++) {
+            status->voltage_curve_point[i].point = i;
+            status->voltage_curve_point[i].frequency =
+                vc_data.curve.vc_points[i].frequency/1000000;
+            status->voltage_curve_point[i].voltage =
+                vc_data.curve.vc_points[i].voltage;
+        }
+    }
+    smi_fill_gpu_kfd_pid_status_(gpu_handle, gpu_id, status);
+    smi_fill_gpu_enumeration_id_status_(gpu_handle, status);
+    // TODO: oper status
+    // TODO: RAS status
+    return SDK_RET_OK;
+}
+
+/// \brief function to get number of bad pages for GPU
+/// \param[in] gpu GPU object
+/// \param[out] num_bad_pages number of bad pages
+/// \return SDK_RET_OK or error code in case of failure
+sdk_ret_t
+smi_gpu_get_bad_page_count (void *gpu_obj,
+                            uint32_t *num_bad_pages)
+{
+    amdsmi_status_t amdsmi_ret;
+    gpu_entry *gpu = (gpu_entry *)gpu_obj;
+
+    // get number of bad page records
+    amdsmi_ret = amdsmi_get_gpu_bad_page_info(gpu->handle(),
+                                              num_bad_pages, NULL);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get bad page information for GPU {}, err {}",
+                      gpu->handle(), amdsmi_ret);
+        return amdsmi_ret_to_sdk_ret(amdsmi_ret);
+    }
+    return SDK_RET_OK;
+}
+
+/// \brief function to get GPU bad page records
+/// \param[in] gpu GPU object
+/// \param[in] num_bad_pages number of bad pages
+/// \param[out] records GPU bad page records
+/// \return SDK_RET_OK or error code in case of failure
+sdk_ret_t
+smi_gpu_get_bad_page_records (void *gpu_obj,
+                              uint32_t num_bad_pages,
+                              aga_gpu_bad_page_record_t *records)
+{
+    amdsmi_status_t amdsmi_ret;
+    gpu_entry *gpu = (gpu_entry *)gpu_obj;
+    amdsmi_retired_page_record_t *bad_pages;
+
+    // allocate memory for bad pages
+    bad_pages =
+        (amdsmi_retired_page_record_t *)malloc(
+            num_bad_pages * sizeof(amdsmi_retired_page_record_t));
+    if (!bad_pages) {
+        AGA_TRACE_ERR("Failed to allocate memory for bad page information "
+                      "for GPU {}", gpu->key().str());
+        return SDK_RET_OOM;
+    }
+    // fill bad page records
+    amdsmi_ret = amdsmi_get_gpu_bad_page_info(gpu->handle(), &num_bad_pages,
+                                              bad_pages);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get bad page information for GPU {}, "
+                      "err {}", gpu->handle(), amdsmi_ret);
+        // free the allocated buffer before bailing out to avoid a leak
+        free(bad_pages);
+        return amdsmi_ret_to_sdk_ret(amdsmi_ret);
+    } else {
+        for (uint32_t i = 0; i < num_bad_pages; i++) {
+            records[i].key = gpu->key();
+            records[i].page_address = bad_pages[i].page_address;
+            records[i].page_size = bad_pages[i].page_size;
+            records[i].page_status =
smi_to_aga_gpu_page_status(bad_pages[i].status); + } + } + // free memory + free(bad_pages); + return SDK_RET_OK; +} + +static sdk_ret_t +smi_fill_vram_usage_ (aga_gpu_handle_t gpu_handle, + aga_gpu_vram_usage_t *usage) +{ + uint64_t value_64; + amdsmi_status_t amdsmi_ret; + + amdsmi_ret = amdsmi_get_gpu_memory_total(gpu_handle, + AMDSMI_MEM_TYPE_VRAM, &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get VRAM total memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->total_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_total(gpu_handle, + AMDSMI_MEM_TYPE_VIS_VRAM, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get visible VRAM total memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->total_visible_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_total(gpu_handle, + AMDSMI_MEM_TYPE_GTT, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GTT total memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->total_gtt = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_usage(gpu_handle, AMDSMI_MEM_TYPE_VRAM, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get VRAM used memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->used_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_usage(gpu_handle, + AMDSMI_MEM_TYPE_VIS_VRAM, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get visible VRAM used memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->used_visible_vram = value_64/1024/1024; + } + amdsmi_ret = amdsmi_get_gpu_memory_usage(gpu_handle, + AMDSMI_MEM_TYPE_GTT, + &value_64); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GTT used memory GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + usage->used_gtt = value_64/1024/1024; + } + usage->free_vram = usage->total_vram - usage->used_vram; + usage->free_visible_vram = usage->total_visible_vram - + usage->used_visible_vram; + usage->free_gtt = usage->total_gtt - usage->used_gtt; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle, + bool partition_capable, + uint32_t partition_id, + aga_gpu_handle_t first_partition_handle, + aga_gpu_stats_t *stats) +{ + amdsmi_status_t amdsmi_ret; + uint64_t sent, received, max_pkt_size; + amdsmi_gpu_metrics_t metrics_info = {}; + amdsmi_violation_status_t violation_status = {}; + + // fill VRAM usage + smi_fill_vram_usage_(gpu_handle, &stats->vram_usage); + // fill additional statistics from gpu metrics + if (g_gpu_metrics.find(gpu_handle) != g_gpu_metrics.end()) { + metrics_info = g_gpu_metrics[gpu_handle]; + // power and voltage + stats->avg_package_power = metrics_info.average_socket_power; + stats->package_power = metrics_info.current_socket_power; + stats->voltage.voltage = metrics_info.voltage_soc; + stats->voltage.gfx_voltage = metrics_info.voltage_gfx; + stats->voltage.memory_voltage = metrics_info.voltage_mem; + // fan speed + stats->fan_speed = metrics_info.current_fan_speed; + // activity information + stats->usage.gfx_activity = metrics_info.average_gfx_activity; + stats->usage.umc_activity = metrics_info.average_umc_activity; + stats->usage.mm_activity = metrics_info.average_mm_activity; + stats->gfx_activity_accumulated = 
metrics_info.gfx_activity_acc; + stats->mem_activity_accumulated = metrics_info.mem_activity_acc; + // xgmi link stats + for (uint32_t i = 0; i < AGA_GPU_MAX_XGMI_LINKS; i++) { + stats->xgmi_link_stats[i].data_read = + metrics_info.xgmi_read_data_acc[i]; + stats->xgmi_link_stats[i].data_write = + metrics_info.xgmi_write_data_acc[i]; + } + // fill violation statistics + amdsmi_ret = amdsmi_get_violation_status(gpu_handle, &violation_status); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get violation status for GPU {}, err {}", + gpu_handle, amdsmi_ret); + // revert to populating from metrics payload + stats->violation_stats.current_accumulated_counter = + metrics_info.accumulation_counter; + stats->violation_stats.processor_hot_residency_accumulated = + metrics_info.prochot_residency_acc; + stats->violation_stats.ppt_residency_accumulated = + metrics_info.ppt_residency_acc; + stats->violation_stats.socket_thermal_residency_accumulated = + metrics_info.socket_thm_residency_acc; + stats->violation_stats.vr_thermal_residency_accumulated = + metrics_info.vr_thm_residency_acc; + stats->violation_stats.hbm_thermal_residency_accumulated = + metrics_info.hbm_thm_residency_acc; + } else { + stats->violation_stats.current_accumulated_counter = + violation_status.acc_counter; + stats->violation_stats.processor_hot_residency_accumulated = + violation_status.acc_prochot_thrm; + stats->violation_stats.ppt_residency_accumulated = + violation_status.acc_ppt_pwr; + stats->violation_stats.socket_thermal_residency_accumulated = + violation_status.acc_socket_thrm; + stats->violation_stats.vr_thermal_residency_accumulated = + violation_status.acc_vr_thrm; + stats->violation_stats.hbm_thermal_residency_accumulated = + violation_status.acc_hbm_thrm; + } + // get usage information from the metrics info for partition 0 + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { + stats->usage.vcn_activity[i] = metrics_info.vcn_activity[i]; + if (partition_capable) { + stats->usage.vcn_busy[i] = + metrics_info.xcp_stats[partition_id].vcn_busy[i]; + } else { + stats->usage.vcn_busy[i] = AMDSMI_INVALID_UINT16; + } + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_JPEG; i++) { + stats->usage.jpeg_activity[i] = metrics_info.jpeg_activity[i]; + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_JPEG_ENG_V1; i++) { + if (partition_capable) { + stats->usage.jpeg_busy[i] = + metrics_info.xcp_stats[partition_id].jpeg_busy[i]; + } else { + stats->usage.jpeg_busy[i] = AMDSMI_INVALID_UINT16; + } + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + if (partition_capable) { + stats->usage.gfx_busy_inst[i] = + metrics_info.xcp_stats[partition_id].gfx_busy_inst[i]; + } else { + stats->usage.gfx_busy_inst[i] = AMDSMI_INVALID_UINT32; + } + } + // fill the energy consumed + stats->energy_consumed = metrics_info.energy_accumulator * + g_energy_counter_resolution; + // fill temperature + stats->temperature.edge_temperature = + (float)metrics_info.temperature_edge; + stats->temperature.junction_temperature = + (float)metrics_info.temperature_hotspot; + stats->temperature.memory_temperature = + (float)metrics_info.temperature_mem; + for (uint32_t i = 0; i < AGA_GPU_MAX_HBM; i++) { + stats->temperature.hbm_temperature[i] = + (float)metrics_info.temperature_hbm[i]; + } + // pcie stats + stats->pcie_stats.replay_count = metrics_info.pcie_replay_count_acc; + stats->pcie_stats.recovery_count = + metrics_info.pcie_l0_to_recov_count_acc; + stats->pcie_stats.replay_rollover_count = + 
metrics_info.pcie_replay_rover_count_acc; + stats->pcie_stats.nack_sent_count = + metrics_info.pcie_nak_sent_count_acc; + stats->pcie_stats.nack_received_count = + metrics_info.pcie_nak_rcvd_count_acc; + stats->pcie_stats.bidir_bandwidth = + metrics_info.pcie_bandwidth_acc; + + // PCIe throughput initialization to invalid value + stats->pcie_stats.tx_bytes = AMDSMI_INVALID_UINT64; + stats->pcie_stats.rx_bytes = AMDSMI_INVALID_UINT64; + + amdsmi_ret = amdsmi_get_gpu_pci_throughput(gpu_handle, &sent, &received, + &max_pkt_size); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get PCIe throughput for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + stats->pcie_stats.tx_bytes = received; + stats->pcie_stats.rx_bytes = sent; + } + } else { + AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // for GPU partitions which are not the first partition, we need to get + // usage information from the first partition + // partition + if (partition_id) { + // get gfx, vcn and jpeg usage from first gpu partition + amdsmi_ret = amdsmi_get_gpu_metrics_info(first_partition_handle, + &metrics_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get GPU metrics info for GPU {}, err {}", + first_partition_handle, amdsmi_ret); + } else { + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { + stats->usage.vcn_busy[i] = + metrics_info.xcp_stats[partition_id].vcn_busy[i]; + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_JPEG_ENG_V1; i++) { + stats->usage.jpeg_busy[i] = + metrics_info.xcp_stats[partition_id].jpeg_busy[i]; + } + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + stats->usage.gfx_busy_inst[i] = + metrics_info.xcp_stats[partition_id].gfx_busy_inst[i]; + } + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_event_read_all (aga_event_read_cb_t cb, void *ctxt) +{ + return g_smi_state.event_read(cb, ctxt); +} + +sdk_ret_t +smi_gpu_reset (aga_gpu_handle_t gpu_handle, + aga_gpu_reset_type_t reset_type) +{ + amdsmi_status_t amdsmi_ret; + amdsmi_power_cap_info_t power_cap_info; + + switch(reset_type) { + case AGA_GPU_RESET_TYPE_NONE: + // reset GPU itself + amdsmi_ret = amdsmi_reset_gpu(gpu_handle); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset GPU {}, err {}", gpu_handle, + amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_CLOCK: + // reset overdrive + amdsmi_ret = amdsmi_set_gpu_overdrive_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset overdrive, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // setting perf level to auto seems to be reset clocks as well + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset clocks, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_FAN: + // reset fans + amdsmi_ret = amdsmi_reset_gpu_fan(gpu_handle, 0); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset fans, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_POWER_PROFILE: + // reset power profile to bootup default + amdsmi_ret = amdsmi_set_gpu_power_profile(gpu_handle, 0, + AMDSMI_PWR_PROF_PRST_BOOTUP_DEFAULT); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset power profile, GPU {}, err {}", + gpu_handle, 
amdsmi_ret); + } + // also reset perf level to auto + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset perf level, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_POWER_OVERDRIVE: + // get default power overdrive + amdsmi_ret = amdsmi_get_power_cap_info(gpu_handle, 0, + &power_cap_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get default power cap, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // set power overdrive to default + amdsmi_ret = amdsmi_set_power_cap(gpu_handle, 0, + power_cap_info.default_power_cap); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set power cap to default, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_XGMI_ERROR: + // reset xgmi error status + amdsmi_ret = amdsmi_reset_gpu_xgmi_error(gpu_handle); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset xgmi error status, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_PERF_DETERMINISM: + // resetting perf level to "auto" resets performance determinism + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to reset perf level, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + break; + case AGA_GPU_RESET_TYPE_COMPUTE_PARTITION: + // TODO: reset partition not yet support by amd-smi + return SDK_RET_OP_NOT_SUPPORTED; + break; + case AGA_GPU_RESET_TYPE_NPS_MODE: + // TODO: reset NPS mode + return SDK_RET_OP_NOT_SUPPORTED; + break; + default: + AGA_TRACE_ERR("unknown reset request for GPU {}", gpu_handle); + return SDK_RET_INVALID_ARG; + } + + return amdsmi_ret_to_sdk_ret(amdsmi_ret); +} + +static sdk_ret_t +smi_gpu_power_cap_update_ (aga_gpu_handle_t gpu_handle, + aga_gpu_spec_t *spec) +{ + amdsmi_status_t amdsmi_ret; + amdsmi_power_cap_info_t power_cap_info; + + // 1. get power cap range + // 2. validate the power cap is within the range + // 3. 
set power cap + // NOTE: power cap 0 indicates reset to default + + // step1: get power cap range + amdsmi_ret = amdsmi_get_power_cap_info(gpu_handle, 0, &power_cap_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get power cap, GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + // step2: validate power cap + power_cap_info.min_power_cap /= 1000000; + power_cap_info.max_power_cap /= 1000000; + if ((spec->gpu_power_cap < power_cap_info.min_power_cap) || + (spec->gpu_power_cap > power_cap_info.max_power_cap)) { + AGA_TRACE_ERR("Power cap {} is out of supported range, GPU {}, " + "allowed range {}-{}", spec->gpu_power_cap, + gpu_handle, power_cap_info.min_power_cap, + power_cap_info.max_power_cap); + return sdk_ret_t(SDK_RET_INVALID_ARG, + ERR_CODE_SMI_GPU_POWER_CAP_OUT_OF_RANGE); + } + // step3: set power cap + amdsmi_ret = amdsmi_set_power_cap(gpu_handle, 0, + (spec->gpu_power_cap * 1000000)); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set power cap, GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_update (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + uint64_t upd_mask) +{ + sdk_ret_t ret; + std::ofstream of; + std::string dev_path; + amdsmi_status_t amdsmi_ret; + amdsmi_clk_type_t clock_type; + amdsmi_dev_perf_level_t perf_level; + + // performance level has to be set to manual (default is auto) to configure + // the following list of attributes to non default values + // 1. GPU overdrive level + // 2. memory overdirve level + + // set compute partition type; we return after this operation as it doesn't + // make sense to update other fields along with compute partition type + if (upd_mask & AGA_GPU_UPD_COMPUTE_PARTITION_TYPE) { + amdsmi_ret = amdsmi_set_gpu_compute_partition(gpu_handle, + aga_to_smi_gpu_compute_partition_type( + spec->compute_partition_type)); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set GPU compute partition type to {}, " + "GPU {}, err {}", spec->compute_partition_type, + gpu_handle, amdsmi_ret); + } + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + + // set memory partition type; we return after this operation as it doesn't + // make sense to update other fields along with memory partition type + if (upd_mask & AGA_GPU_UPD_MEMORY_PARTITION_TYPE) { + amdsmi_ret = amdsmi_set_gpu_memory_partition(gpu_handle, + aga_to_smi_gpu_memory_partition_type( + spec->memory_partition_type)); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set GPU memory partition type to {}, " + "GPU {}, err {}", spec->memory_partition_type, + gpu_handle, amdsmi_ret); + } + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + + // set performance level to manual if required + if (upd_mask & AGA_GPU_UPD_OVERDRIVE_LEVEL) { + amdsmi_ret = amdsmi_get_gpu_perf_level(gpu_handle, &perf_level); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get performance level GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + // if performance level is not manual already, set it to manual + if (perf_level != AMDSMI_DEV_PERF_LEVEL_MANUAL) { + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, + AMDSMI_DEV_PERF_LEVEL_MANUAL); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set performance level to manual, " + "GPU {}, err 
{}", gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + } + // overdrive update + if (upd_mask & AGA_GPU_UPD_OVERDRIVE_LEVEL) { + amdsmi_ret = amdsmi_set_gpu_overdrive_level(gpu_handle, + spec->overdrive_level); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set overdrive level, GPU {}, err {}", + gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + // system clock frequence range update + if (upd_mask & AGA_GPU_UPD_CLOCK_FREQ_RANGE) { + for (uint32_t i = 0; i < AGA_GPU_NUM_CFG_CLOCK_TYPES; i++) { + ret = aga_to_smi_gpu_clock_type(spec->clock_freq[i].clock_type, + &clock_type); + if (ret != SDK_RET_OK) { + AGA_TRACE_ERR("Invalid clock type {} specified, GPU {}", + spec->clock_freq[i].clock_type, gpu_handle); + return SDK_RET_INVALID_ARG; + } + amdsmi_ret = amdsmi_set_gpu_clk_range(gpu_handle, + spec->clock_freq[i].lo, spec->clock_freq[i].hi, + clock_type); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set clock {} frequency range, GPU {}, " + "range {}-{}, err {}", + spec->clock_freq[i].clock_type, gpu_handle, + spec->clock_freq[i].lo, spec->clock_freq[i].hi, + amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + } + // power cap update + if (upd_mask & AGA_GPU_UPD_POWER_CAP) { + ret = smi_gpu_power_cap_update_(gpu_handle, spec); + if (ret != SDK_RET_OK) { + return ret; + } + } + // performance level update + if (upd_mask & AGA_GPU_UPD_PERF_LEVEL) { + perf_level = aga_to_smi_gpu_perf_level(spec->perf_level); + amdsmi_ret = amdsmi_set_gpu_perf_level(gpu_handle, perf_level); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set performance level to {}, " + "GPU {}, err {}", perf_level, gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + // fan speed update + if (upd_mask & AGA_GPU_UPD_FAN_SPEED) { + amdsmi_ret = amdsmi_set_gpu_fan_speed(gpu_handle, 0, + (int64_t)spec->fan_speed); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to set fan speed to {}, GPU {}, err {}", + spec->fan_speed, gpu_handle, amdsmi_ret); + return (amdsmi_ret_to_sdk_ret(amdsmi_ret)); + } + } + // TODO: RAS spec update + return SDK_RET_OK; +} + +/// \brief callback function to be used to fill topology information between +/// two GPUS +/// \param[in] obj GPU object returned by walk function +/// \param[in] ctxt opaque context passed to the callback function +/// \return false in case walk should continue or true otherwise +static inline bool +gpu_topo_walk_cb (void *obj, void *ctxt) +{ + gpu_entry *gpu1, *gpu2; + amdsmi_status_t amdsmi_ret; + static std::string name = "GPU"; + gpu_topo_walk_ctxt_t *walk_ctxt; + aga_device_topology_info_t *info; + + gpu2 = (gpu_entry *)obj; + walk_ctxt = (gpu_topo_walk_ctxt_t *)ctxt; + gpu1 = walk_ctxt->gpu; + info = walk_ctxt->info; + + if (gpu1->handle() != gpu2->handle()) { + info->peer_device[walk_ctxt->count].peer_device.type = + AGA_DEVICE_TYPE_GPU; + strcpy(info->peer_device[walk_ctxt->count].peer_device.name, + (name + std::to_string(gpu2->id())).c_str()); + amdsmi_ret = + amdsmi_topo_get_link_type(gpu1->handle(), gpu2->handle(), + &info->peer_device[walk_ctxt->count].num_hops, + (amdsmi_link_type_t *) + &info->peer_device[walk_ctxt->count].connection.type); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get link type between gpus {} and {}, " + "err {}", gpu1->handle(), gpu2->handle(), amdsmi_ret); + // in case 
of error set num hops to 0xffff and IO link type to + // none + info->peer_device[walk_ctxt->count].num_hops = 0xffff; + info->peer_device[walk_ctxt->count].connection.type = + AGA_IO_LINK_TYPE_NONE; + } + amdsmi_ret = amdsmi_topo_get_link_weight(gpu1->handle(), gpu2->handle(), + &info->peer_device[walk_ctxt->count].link_weight); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get weight for link between gpus {}" + "and {}, err {}", gpu1->handle(), gpu2->handle(), + amdsmi_ret); + // in case of error set link weight to 0xffff + info->peer_device[walk_ctxt->count].link_weight = 0xffff; + } + info->peer_device[walk_ctxt->count].valid = true; + walk_ctxt->count++; + } + return false; +} + +sdk_ret_t +smi_gpu_fill_device_topology (aga_gpu_handle_t gpu_handle, + aga_device_topology_info_t *info) +{ + gpu_entry *gpu; + gpu_topo_walk_ctxt_t ctxt; + + gpu = gpu_db()->find(gpu_handle); + if (gpu == NULL) { + AGA_TRACE_ERR("Failed to find GPU {}", gpu_handle); + return SDK_RET_ENTRY_NOT_FOUND; + } + + ctxt.count = 0; + ctxt.info = info; + ctxt.gpu = gpu; + + // walk gpu db and fill device topology + gpu_db()->walk_handle_db(gpu_topo_walk_cb, &ctxt); + return SDK_RET_OK; +} + +/// \brief function to get aga_obj_key_t for a given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] key aga_obj_key_t of the GPU +static sdk_ret_t +smi_gpu_uuid_get (aga_gpu_handle_t gpu_handle, aga_obj_key_t *key) +{ + amdsmi_status_t status; + char uuid_rem[20]; + char uuid[AMDSMI_GPU_UUID_SIZE]; + uint32_t uuid_len = AMDSMI_GPU_UUID_SIZE; + + // get uuid from amdsmi + status = amdsmi_get_gpu_device_uuid(gpu_handle, &uuid_len, uuid); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get uuid of GPU {}, err {}", + gpu_handle, status); + return amdsmi_ret_to_sdk_ret(status); + } + // amdsmi returns a string containing the uuid of the GPU (ex: + // 2eff74a1-0000-1000-80fe-9cea14a6b148); to derive the aga_obj_key_t from + // it we scan the string and construct our aga_obj_key_t + sscanf(uuid, "%x-%hx-%hx-%hx-%s", (uint32_t *)&key->id[0], + (uint16_t *)&key->id[4], (uint16_t *)&key->id[6], + (uint16_t *)&key->id[8], uuid_rem); + *(uint32_t *)&key->id[0] = htonl(*(uint32_t *)&key->id[0]); + *(uint16_t *)&key->id[4] = htons(*(uint16_t *)&key->id[4]); + *(uint16_t *)&key->id[6] = htons(*(uint16_t *)&key->id[6]); + *(uint16_t *)&key->id[8] = htons(*(uint16_t *)&key->id[8]); + sscanf(uuid_rem, "%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx", &key->id[10], + &key->id[11], &key->id[12], &key->id[13], &key->id[14], + &key->id[15]); + return SDK_RET_OK; +} + +sdk_ret_t +smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles, + aga_obj_key_t *gpu_keys) +{ + sdk_ret_t ret; + uint32_t num_procs; + uint32_t num_sockets; + amdsmi_status_t status; + processor_type_t proc_type; + amdsmi_socket_handle socket_handles[AGA_MAX_SOCKET]; + aga_gpu_handle_t proc_handles[AGA_MAX_PROCESSORS_PER_SOCKET]; + + if (!num_gpus) { + return SDK_RET_ERR; + } + *num_gpus = 0; + // get the socket count available in the system + status = amdsmi_get_socket_handles(&num_sockets, NULL); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get number of sockets from amd smi library, " + "err {}", status); + return amdsmi_ret_to_sdk_ret(status); + } + // get the socket handles in the system + status = amdsmi_get_socket_handles(&num_sockets, &socket_handles[0]); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get socket handles from amd smi 
library, " + "err {}", status); + return amdsmi_ret_to_sdk_ret(status); + } + for (uint32_t i = 0; i < num_sockets; i++) { + // for each socket get the number of processors + status = amdsmi_get_processor_handles(socket_handles[i], + &num_procs, NULL); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get number of processors in socket handle " + "{} from amd smi library, err {}", socket_handles[i], + status); + return amdsmi_ret_to_sdk_ret(status); + } + // for each socket get the processor handles + status = amdsmi_get_processor_handles(socket_handles[i], + &num_procs, &proc_handles[0]); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get processor handles in socket handle " + "{} from amd smi library, err {}", socket_handles[i], + status); + return amdsmi_ret_to_sdk_ret(status); + } + // get uuids of each GPU + for (uint32_t j = 0; j < num_procs; j++) { + status = amdsmi_get_processor_type(proc_handles[j], &proc_type); + if (unlikely(status != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get processor type of processor {}" + " from amd smi library, err {}", proc_handles[j], + status); + return amdsmi_ret_to_sdk_ret(status); + } + if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + gpu_handles[*num_gpus] = proc_handles[j]; + if (gpu_keys) { + ret = smi_gpu_uuid_get(proc_handles[j], + &gpu_keys[*num_gpus]); + if (ret != SDK_RET_OK) { + AGA_TRACE_ERR("GPU discovery failed due to error in " + "getting UUID of GPU {}", + proc_handles[j]); + return ret; + } + } + (*num_gpus)++; + } + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + aga_gpu_status_t *status) +{ + uint64_t value_64; + amdsmi_fw_info_t fw_info; + amdsmi_status_t amdsmi_ret; + amdsmi_vbios_info_t vbios_info; + amdsmi_board_info_t board_info; + amdsmi_driver_info_t driver_info; + amdsmi_virtualization_mode_t mode; + + // fill immutable attributes in spec + // fill gpu and memory clock frequencies + smi_fill_gpu_clock_frequency_spec_(gpu_handle, spec); + + // fill immutable attributes in status + // fill the GPU serial number + amdsmi_ret = amdsmi_get_gpu_board_info(gpu_handle, &board_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get serial number for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + // fill the virtualization mode + amdsmi_ret = amdsmi_get_gpu_virtualization_mode(gpu_handle, &mode); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get virtualization mode for GPU {}, err {}", + gpu_handle, amdsmi_ret); + } else { + status->virtualization_mode = smi_to_aga_virtualization_mode(mode); + } + memcpy(status->serial_num, board_info.product_serial, AGA_MAX_STR_LEN); + // fill the GPU card series + memcpy(status->card_series, board_info.product_name, AGA_MAX_STR_LEN); + // fill the GPU vendor information + memcpy(status->card_vendor, board_info.manufacturer_name, AGA_MAX_STR_LEN); + // fill the GPU card model + memcpy(status->card_model, board_info.model_number, AGA_MAX_STR_LEN); + // fill the driver version + amdsmi_ret = amdsmi_get_gpu_driver_info(gpu_handle, &driver_info); + if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { + AGA_TRACE_ERR("Failed to get system driver information, GPU {}, err {}", + gpu_handle, amdsmi_ret); + } + memcpy(status->driver_version, driver_info.driver_version, AGA_MAX_STR_LEN); + + // fill the vbios version + amdsmi_ret = amdsmi_get_gpu_vbios_info(gpu_handle, &vbios_info); + 
if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get vbios version for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        strncpy(status->vbios_version, vbios_info.version, AGA_MAX_STR_LEN);
+        strncpy(status->vbios_part_number, vbios_info.part_number,
+                AGA_MAX_STR_LEN);
+        // sku should be retrieved from vbios version
+        gpu_get_sku_from_vbios_(status->card_sku, vbios_info.part_number);
+    }
+    // fill the firmware version
+    amdsmi_ret = amdsmi_get_fw_info(gpu_handle, &fw_info);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get firmware version for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        memset(status->fw_version, 0,
+               sizeof(aga_gpu_fw_version_t) * AGA_GPU_MAX_FIRMWARE_VERSION);
+        for (uint32_t i = 0; i < fw_info.num_fw_info; i++) {
+            fill_gpu_fw_version_(&status->fw_version[i],
+                                 fw_info.fw_info_list[i].fw_id,
+                                 fw_info.fw_info_list[i].fw_version);
+        }
+        status->num_fw_versions = fw_info.num_fw_info;
+    }
+    // fill the memory vendor
+    amdsmi_ret = amdsmi_get_gpu_vram_vendor(gpu_handle, status->memory_vendor,
+                                            AGA_MAX_STR_LEN);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get memory vendor for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    }
+    // fill vram status
+    smi_fill_vram_status_(gpu_handle, &status->vram_status);
+    // fill GPU BDF
+    amdsmi_ret = amdsmi_get_gpu_bdf_id(gpu_handle, &value_64);
+    if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+        AGA_TRACE_ERR("Failed to get PCIe bus id for GPU {}, err {}",
+                      gpu_handle, amdsmi_ret);
+    } else {
+        // convert PCIe bus id to XXXX:XX:XX.X format
+        snprintf(status->pcie_status.pcie_bus_id, AGA_MAX_STR_LEN,
+                 "%04X:%02X:%02X.%X",
+                 ((uint32_t)((value_64 >> 32) & 0xffffffff)),
+                 ((uint32_t)((value_64 >> 8) & 0xff)),
+                 ((uint32_t)((value_64 >> 3) & 0x1f)),
+                 ((uint32_t)(value_64 & 0x7)));
+    }
+    // get energy counter resolution if not already set
+    if (g_energy_counter_resolution == 0.0) {
+        amdsmi_ret = amdsmi_get_energy_count(gpu_handle, &value_64,
+                         &g_energy_counter_resolution, &value_64);
+        if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) {
+            AGA_TRACE_ERR("Failed to get energy count for GPU {}, err {}",
+                          gpu_handle, amdsmi_ret);
+            // in case of failure use the default value
+            g_energy_counter_resolution = AMDSMI_COUNTER_RESOLUTION;
+        }
+    }
+    return SDK_RET_OK;
+}
+
+static inline std::string
+timestamp_string_from_cper_timestamp (amdsmi_cper_timestamp_t *ts)
+{
+    uint32_t full_year;
+    std::ostringstream oss;
+
+    // assuming year is offset from 2000
+    full_year = 2000 + ts->year;
+
+    // cast the narrow timestamp fields to an integer type so they are
+    // streamed as numbers rather than characters
+    oss << std::setfill('0') << std::setw(4) << full_year << "-"
+        << std::setw(2) << static_cast<uint32_t>(ts->month) << "-"
+        << std::setw(2) << static_cast<uint32_t>(ts->day) << " "
+        << std::setw(2) << static_cast<uint32_t>(ts->hours) << ":"
+        << std::setw(2) << static_cast<uint32_t>(ts->minutes) << ":"
+        << std::setw(2) << static_cast<uint32_t>(ts->seconds);
+
+    return oss.str();
+}
+
+sdk_ret_t
+smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle,
+                          aga_cper_severity_t severity, aga_cper_info_t *info)
+{
+    char *cper_data;
+    char *cper_buffer;
+    uint64_t cursor = 0;
+    uint32_t severity_mask;
+    amdsmi_status_t afid_status;
+    uint64_t total_cper_entries = 0;
+    uint64_t buf_size = CPER_BUF_SIZE;
+    uint32_t prev_cper_record_size = 0;
+    uint64_t num_cper_hdr = AGA_GPU_MAX_CPER_ENTRY;
+    amdsmi_status_t status = AMDSMI_STATUS_MORE_DATA;
+    amdsmi_cper_hdr_t *cper_hdrs[AGA_GPU_MAX_CPER_ENTRY];
+
+    // set severity mask
+    switch (severity) {
+    case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED:
severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED); + break; + case AGA_CPER_SEVERITY_FATAL: + severity_mask = (1 << AMDSMI_CPER_SEV_FATAL); + break; + case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + default: + severity_mask = (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED) | + (1 << AMDSMI_CPER_SEV_FATAL) | + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + break; + } + // allocate memory for CPER data + cper_data = (char *)malloc(buf_size); + // cper_buffer is used to keep track of each individual record + cper_buffer = cper_data; + while (status == AMDSMI_STATUS_MORE_DATA) { + // get CPER entries + status = amdsmi_get_gpu_cper_entries(gpu_handle, severity_mask, + cper_data, &buf_size, cper_hdrs, &num_cper_hdr, &cursor); + if ((status != AMDSMI_STATUS_SUCCESS) && + (status != AMDSMI_STATUS_MORE_DATA)) { + AGA_TRACE_ERR("Failed to get CPER entries for GPU {}, err {}", + gpu_handle, status); + // free allocated memory + free(cper_data); + return amdsmi_ret_to_sdk_ret(status); + } + for (uint64_t i = 0; + i < num_cper_hdr && total_cper_entries < AGA_GPU_MAX_CPER_ENTRY; + i++, total_cper_entries++) { + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + cper_entry->record_id = std::string(cper_hdrs[i]->record_id); + cper_entry->severity = + smi_to_aga_cper_severity(cper_hdrs[i]->error_severity); + cper_entry->revision = cper_hdrs[i]->revision; + if (cper_hdrs[i]->cper_valid_bits.valid_bits.timestamp) { + cper_entry->timestamp = + timestamp_string_from_cper_timestamp( + &cper_hdrs[i]->timestamp); + } + cper_entry->creator_id = std::string(cper_hdrs[i]->creator_id); + cper_entry->notification_type = + smi_to_aga_cper_notification_type(cper_hdrs[i]->notify_type); + // get AMD field ids from the cper record + cper_buffer += prev_cper_record_size; + // initialize num_af_id to be the size of the array + cper_entry->num_af_id = AGA_GPU_MAX_AF_ID_PER_CPER; + afid_status = amdsmi_get_afids_from_cper(cper_buffer, + cper_hdrs[i]->record_length, cper_entry->af_id, + &cper_entry->num_af_id); + if (afid_status != AMDSMI_STATUS_SUCCESS) { + cper_entry->num_af_id = 0; + AGA_TRACE_ERR("Failed to get AMD field id for CPER entry for " + "GPU {}, err {}", gpu_handle, status); + } + // update prev_cper_record_size + prev_cper_record_size = cper_hdrs[i]->record_length; + } + } + + // free allocated memory + free(cper_data); + return SDK_RET_OK; +} + +} // namespace aga diff --git a/sw/nic/gpuagent/api/smi/smi_api_mock.cc b/sw/nic/gpuagent/api/smi/smi_api_mock.cc index 1e86361..2b7204f 100644 --- a/sw/nic/gpuagent/api/smi/smi_api_mock.cc +++ b/sw/nic/gpuagent/api/smi/smi_api_mock.cc @@ -277,6 +277,37 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle, for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { stats->usage.gfx_busy_inst[i] = distr(gen) % 100 ; } + // fill violation stats + stats->violation_stats.current_accumulated_counter = 123456 + distr(gen) - distr(gen); + stats->violation_stats.processor_hot_residency_accumulated = 23456 + distr(gen) - distr(gen); + stats->violation_stats.ppt_residency_accumulated = 34567 + distr(gen) - distr(gen); + stats->violation_stats.socket_thermal_residency_accumulated = 45678 + distr(gen) - distr(gen); + stats->violation_stats.vr_thermal_residency_accumulated = 56789 + distr(gen) - distr(gen); + stats->violation_stats.hbm_thermal_residency_accumulated = 67890 + distr(gen) - distr(gen); + stats->violation_stats.processor_hot_residency_percentage = distr(gen) % 100; + 
stats->violation_stats.ppt_residency_percentage = distr(gen) % 100; + stats->violation_stats.socket_thermal_residency_percentage = distr(gen) % 100; + stats->violation_stats.vr_thermal_residency_percentage = distr(gen) % 100; + stats->violation_stats.hbm_thermal_residency_percentage = distr(gen) % 100; + + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + stats->violation_stats.gfx_clk_below_host_limit_power_accumulated[i] = + 1234 + distr(gen) - distr(gen); + stats->violation_stats.gfx_clk_below_host_limit_thermal_accumulated[i] = + 2345 + distr(gen) - distr(gen); + stats->violation_stats.gfx_low_utilization_accumulated[i] = + 3456 + distr(gen) - distr(gen); + stats->violation_stats.gfx_clk_below_host_limit_total_accumulated[i] = + 4567 + distr(gen) - distr(gen); + stats->violation_stats.gfx_clk_below_host_limit_power_percentage[i] = + distr(gen) % 100; + stats->violation_stats.gfx_clk_below_host_limit_thermal_percentage[i] = + distr(gen) % 100; + stats->violation_stats.gfx_low_utilization_percentage[i] = + distr(gen) % 100; + stats->violation_stats.gfx_clk_below_host_limit_total_percentage[i] = + distr(gen) % 100; + } return SDK_RET_OK; } diff --git a/sw/nic/gpuagent/api/smi/smi_api_mock.cc.orig b/sw/nic/gpuagent/api/smi/smi_api_mock.cc.orig new file mode 100644 index 0000000..1e86361 --- /dev/null +++ b/sw/nic/gpuagent/api/smi/smi_api_mock.cc.orig @@ -0,0 +1,706 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +//---------------------------------------------------------------------------- +/// +/// \file +/// smi layer mock API definitions +/// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include "nic/sdk/include/sdk/base.hpp" +#include "nic/sdk/lib/event_thread/event_thread.hpp" +#include "nic/gpuagent/core/aga_core.hpp" +#include "nic/gpuagent/core/ipc_msg.hpp" +#include "nic/gpuagent/core/trace.hpp" +#include "nic/gpuagent/api/aga_state.hpp" +#include "nic/gpuagent/api/include/aga_gpu.hpp" +#include "nic/gpuagent/api/include/aga_init.hpp" +#include "nic/gpuagent/api/smi/smi_api.hpp" +#include "nic/gpuagent/api/smi/smi_events.hpp" +#include "nic/gpuagent/api/smi/smi_api_mock_impl.hpp" + +/// initial delay (in seconds) after which event monitoring starts +#define AGA_SMI_EVENT_MONITOR_START_DELAY 10.0 +/// event monitoring frequency (in seconds) +#define AGA_SMI_EVENT_MONITOR_INTERVAL 3.0 + +namespace aga { + +/// event database indexed by processor handle +unordered_map g_gpu_event_db; +/// event monitor thread instance +sdk::event_thread::event_thread *g_event_monitor_thread; + +/// \brief fill clock frequency ranges of the given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] spec spec to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_gpu_clock_frequency_spec_ (aga_gpu_handle_t gpu_handle, + aga_gpu_spec_t *spec) +{ + // fill sClock spec + spec->clock_freq[0].clock_type = AGA_GPU_CLOCK_TYPE_SYSTEM; + spec->clock_freq[0].lo = 500; + spec->clock_freq[0].hi = 1700; + // fill mClock spec + spec->clock_freq[1].clock_type = AGA_GPU_CLOCK_TYPE_MEMORY; + spec->clock_freq[1].lo = 400; + spec->clock_freq[1].hi = 1600; + // fill video clock spec + spec->clock_freq[2].clock_type = AGA_GPU_CLOCK_TYPE_VIDEO; + spec->clock_freq[2].lo = 914; + spec->clock_freq[2].hi = 1333; + // fill data clock spec + spec->clock_freq[3].clock_type = AGA_GPU_CLOCK_TYPE_DATA; + spec->clock_freq[3].lo = 711; + spec->clock_freq[3].hi = 1143; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_init_immutable_attrs (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + aga_gpu_status_t *status) +{ + // no need to do anything for mock + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_spec (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec) +{ + spec->overdrive_level = 0; + spec->perf_level = AGA_GPU_PERF_LEVEL_AUTO; + + // fill gpu and memory clock frequencies + smi_fill_gpu_clock_frequency_spec_(gpu_handle, spec); + spec->compute_partition_type = AGA_GPU_COMPUTE_PARTITION_TYPE_SPX; + return SDK_RET_OK; +} + +/// \brief fill GPU enumeration ids info using the given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_gpu_enumeration_id_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_status_t *status) +{ + status->kfd_id = 58934; + status->node_id = 3; + status->drm_render_id = 128; + status->drm_card_id = 3; + return SDK_RET_OK; +} + +/// \brief fill list of pids using the given GPU +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_gpu_kfd_pid_status_ (aga_gpu_handle_t gpu_handle, + aga_gpu_status_t *status) +{ + // TODO: fill kfd pids when this data is available + return SDK_RET_OK; +} + +/// \brief function to format firmware version +/// 
\param[out] fw_version firmware component/version after formatting +/// \param[in] block firmware component name +/// \param[in] version firmware version +/// \return none +static void +fill_gpu_fw_version_ (aga_gpu_fw_version_t *fw_version, const char *block, + const char *version) +{ + strncpy(fw_version->firmware, block, AGA_MAX_STR_LEN); + strncpy(fw_version->version, version, AGA_MAX_STR_LEN); +} + +/// \brief fill supported and current frequencies of system clocks +/// \param[in] gpu_handle GPU handle +/// \param[out] status operational status to be filled +/// \return SDK_RET_OK or error code in case of failure +static sdk_ret_t +smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, aga_gpu_status_t *status) +{ + for (uint32_t i = 0; i < AGA_GPU_MAX_CLOCK; i++) { + auto clock_status = &status->clock_status[i]; + if (i < AGA_GPU_GFX_MAX_CLOCK) { + // gfx clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_SYSTEM; + clock_status->frequency = 138 + i; + clock_status->locked = (i % 2); + clock_status->deep_sleep = + (clock_status->frequency <= 140) ? true : false; + } else if (i < (AGA_GPU_GFX_MAX_CLOCK + AGA_GPU_MEM_MAX_CLOCK)) { + // memory clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_MEMORY; + clock_status->frequency = 900; + clock_status->locked = false; + clock_status->deep_sleep = false; + } else if (i < (AGA_GPU_GFX_MAX_CLOCK + AGA_GPU_MEM_MAX_CLOCK + + AGA_GPU_VIDEO_MAX_CLOCK)) { + // video clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_VIDEO; + clock_status->frequency = 29; + clock_status->locked = false; + clock_status->deep_sleep = true; + } else { + // data clock + clock_status->clock_type = AGA_GPU_CLOCK_TYPE_DATA; + clock_status->frequency = 22; + clock_status->locked = false; + clock_status->deep_sleep = true; + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_status (aga_gpu_handle_t gpu_handle, uint32_t gpu_id, + aga_gpu_spec_t *spec, aga_gpu_status_t *status) +{ + status->index = gpu_id; + status->handle = gpu_handle; + // fill the GPU serial number + strncpy(status->serial_num, "PCB046982-0071", AGA_MAX_STR_LEN); + // fill the GPU card series + strncpy(status->card_series, "AMD INSTINCT MI300 (MCM) OAM AC MBA MSFT", + AGA_MAX_STR_LEN); + // fill the GPU card model + strncpy(status->card_model, "102-G30211-00", AGA_MAX_STR_LEN); + // fill the GPU vendor information + strncpy(status->card_vendor, "Advanced Micro Devices, Inc. 
[AMD/ATI]", + AGA_MAX_STR_LEN); + // fill the driver version + strncpy(status->driver_version, "7.0.0", AGA_MAX_STR_LEN); + // fill the vbios part number + strncpy(status->vbios_part_number, "113-D65205-107", AGA_MAX_STR_LEN); + // fill the vbios version + strncpy(status->vbios_version, "022.040.003.041.000001", AGA_MAX_STR_LEN); + // fill sku + strncpy(status->card_sku, "D65205", AGA_MAX_STR_LEN); + // fill the firmware version + fill_gpu_fw_version_(&status->fw_version[1], "MEC2", "78"); + fill_gpu_fw_version_(&status->fw_version[2], "RLC", "17"); + fill_gpu_fw_version_(&status->fw_version[4], "SDMA2", "8"); + fill_gpu_fw_version_(&status->fw_version[7], "TA_RAS", "27.00.01.60"); + fill_gpu_fw_version_(&status->fw_version[8], "TA_XGMI", "32.00.00.19"); + fill_gpu_fw_version_(&status->fw_version[9], "VCN", "0x0110101b"); + // fill the memory vendor + strncpy(status->memory_vendor, "hynix", AGA_MAX_STR_LEN); + smi_fill_clock_status_(gpu_handle, status); + // fill the PCIe bus id + strncpy(status->pcie_status.pcie_bus_id, "0000:59:00.0", AGA_MAX_STR_LEN); + status->pcie_status.slot_type = AGA_PCIE_SLOT_TYPE_OAM; + status->pcie_status.width = 16; + status->pcie_status.max_width = 16; + status->pcie_status.speed = 16; + status->pcie_status.max_speed = 32; + status->pcie_status.bandwidth = 315; + // fill VRAM status + status->vram_status.type = AGA_VRAM_TYPE_HBM; + strcpy(status->vram_status.vendor, "hynix"); + status->vram_status.size = 196592; + // fill the xgmi error count + status->xgmi_status.error_status = AGA_GPU_XGMI_STATUS_NO_ERROR; + // fill total memory + // fill kfd pid info + smi_fill_gpu_kfd_pid_status_(gpu_handle, status); + status->partition_id = 0; + smi_fill_gpu_enumeration_id_status_(gpu_handle, status); + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle, + bool partition_capable, + uint32_t partition_id, + aga_gpu_handle_t first_partition_handle, + aga_gpu_stats_t *stats) +{ + std::random_device rd; // obtain a random number from hardware + std::mt19937 gen(rd()); // seed the generator + std::uniform_int_distribution<> distr(0, 90); + + // fill the avg package power + stats->avg_package_power = 90 + distr(gen) - distr(gen); + // fill the current package power + stats->package_power = 90 + distr(gen) - distr(gen); + // fill the GPU usage + stats->usage.gfx_activity = distr(gen) % 100; + // fill VRAM usage + stats->vram_usage.total_vram = 196592; + stats->vram_usage.used_vram = 1273; + stats->vram_usage.free_vram = + stats->vram_usage.total_vram - stats->vram_usage.used_vram; + stats->vram_usage.total_visible_vram = 196592; + stats->vram_usage.used_visible_vram = 1273; + stats->vram_usage.free_visible_vram = + stats->vram_usage.total_visible_vram - + stats->vram_usage.used_visible_vram; + stats->vram_usage.total_gtt = 128716; + stats->vram_usage.used_gtt = 20; + stats->vram_usage.free_gtt = + stats->vram_usage.total_gtt - stats->vram_usage.used_gtt; + // fill the PCIe stats + ++stats->pcie_stats.replay_count; + ++stats->pcie_stats.tx_bytes; + ++stats->pcie_stats.recovery_count; + ++stats->pcie_stats.replay_rollover_count; + ++stats->pcie_stats.nack_sent_count; + ++stats->pcie_stats.nack_received_count; + ++stats->pcie_stats.rx_bytes; + ++stats->pcie_stats.tx_bytes; + ++stats->pcie_stats.bidir_bandwidth; + // fill the energy consumed + stats->energy_consumed = 25293978861568 + distr(gen) - distr(gen); + for (uint16_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + stats->usage.gfx_busy_inst[i] = distr(gen) % 100 ; + } + return SDK_RET_OK; +} + 
+typedef struct gpu_event_cb_ctxt_s { + aga_event_read_cb_t cb; + void *ctxt; +} gpu_event_cb_ctxt_t; + +// generate one event for each GPU +static inline bool +gpu_event_read_cb (void *obj, void *ctxt) +{ + timespec_t ts; + aga_event_t event = {}; + aga_event_id_t event_id; + void *event_buffer = event_get(); + gpu_entry *gpu = (gpu_entry *)obj; + gpu_event_cb_ctxt_t *walk_ctxt = (gpu_event_cb_ctxt_t *)ctxt; + + event_id = event_buffer_get_event_id(event_buffer, 0); + + // get current time + clock_gettime(CLOCK_REALTIME, &ts); + // fill the event information + event.id = event_id; + event.timestamp = ts; + event.gpu = gpu->key(); + strncpy(event.message, event_buffer_get_message(event_buffer, 0), + AGA_MAX_EVENT_STR); + event.message[AGA_MAX_EVENT_STR] = '\0'; + // call the callback now + walk_ctxt->cb(&event, walk_ctxt->ctxt); + return false; +} + +sdk_ret_t +event_read (aga_event_read_cb_t cb, void *ctxt) +{ + gpu_event_cb_ctxt_t event_ctxt; + + event_ctxt.cb = cb; + event_ctxt.ctxt = ctxt; + gpu_db()->walk(gpu_event_read_cb, &event_ctxt); + return SDK_RET_OK; +} + +sdk_ret_t +smi_event_read_all (aga_event_read_cb_t cb, void *ctxt) +{ + return event_read(cb, ctxt); +} + +sdk_ret_t +event_monitor_init (void) +{ + gpu_event_record_t null_event_record = {}; + + // initialize the s/w state + for (uint32_t d = 0; d < AGA_MOCK_NUM_GPU; d++) { + SDK_SPINLOCK_INIT(&g_gpu_event_db[gpu_get_handle(d)].slock, + PTHREAD_PROCESS_SHARED); + } + return SDK_RET_OK; +} + +sdk_ret_t +cleanup_event_listeners (vector& listeners) +{ + aga_event_listener_info_t listener; + + for (auto it = listeners.begin(); it != listeners.end(); it++) { + listener = *it; + + // if client context of one gpu is inactive, + // we should erase the client context from all gpus + // and all events related to this gRPC stream before + // waking up the front end, otherwise the client contexts + // stored for other gpus for the same subscribe request + // will eventually lead to agent crash + + for (uint32_t d = 0; d < AGA_MOCK_NUM_GPU; d++) { + // lock the event state for this device + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + for (uint32_t e = (AGA_EVENT_ID_NONE + 1); e <= AGA_EVENT_ID_MAX; + e++) { + auto& event_record = + g_gpu_event_db[gpu_get_handle(d)].event_map[(aga_event_id_t)e]; + // erase the client + event_record.client_info.client_set.erase(listener.client_ctxt); + } + // unlock the event state for this device + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + } + // wakeup the front end thread so it can exit + listener.client_ctxt->client_inactive = true; + AGA_TRACE_INFO("Signaling frontend gRPC thread to quit, client {}, " + "client ctxt {}, stream {}", + listener.client_ctxt->client.c_str(), + (void *)listener.client_ctxt, + listener.client_ctxt->stream); + pthread_cond_signal(&listener.client_ctxt->cond); + } + return SDK_RET_OK; +} + +static sdk_ret_t +handle_events (uint32_t num_events, void *event_buffer) +{ + sdk_ret_t ret; + timespec_t ts; + gpu_entry *gpu; + aga_gpu_handle_t gpu_handle; + aga_event_t event = {}; + aga_event_id_t event_id; + aga_event_client_ctxt_t *client_ctxt; + aga_event_listener_info_t inactive_listener; + vector inactive_listeners; + + // get current time + clock_gettime(CLOCK_REALTIME, &ts); + // start processing all the events + for (uint32_t i = 0; i < num_events; i++) { + gpu_handle = event_buffer_get_gpu_handle(event_buffer, i); + gpu = gpu_db()->find(gpu_handle); + if (gpu == NULL) { + continue; + } + event_id = 
event_buffer_get_event_id(event_buffer, i); + auto& event_map = g_gpu_event_db[gpu_handle].event_map; + + // lock the event state for this device + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_handle].slock); + // update our event state + auto& event_record = event_map[event_id]; + event_record.timestamp = ts; + strncpy(event_record.message, event_buffer_get_message(event_buffer, i), + AGA_MAX_EVENT_STR); + event_record.message[AGA_MAX_EVENT_STR] = '\0'; + // fill the event record + event.id = event_id; + event.timestamp = ts; + event.gpu = gpu->key(); + strncpy(event.message, event_buffer_get_message(event_buffer, i), + AGA_MAX_EVENT_STR); + event.message[AGA_MAX_EVENT_STR] = '\0'; + // walk thru all the clients that are interested in this event and + // notify them + for (auto client_set_it = event_record.client_info.client_set.begin(); + client_set_it != event_record.client_info.client_set.end(); + client_set_it++) { + client_ctxt = *client_set_it; + // invoke the event notification callback + ret = client_ctxt->notify_cb(&event, *client_set_it); + if (unlikely(ret != SDK_RET_OK)) { + // add to list of clients not reachable + inactive_listener.gpu_id = gpu->id(); + inactive_listener.event = event_id; + inactive_listener.client_ctxt = *client_set_it; + inactive_listeners.push_back(inactive_listener); + } + } + // unlock the event state maintained for this device + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_handle].slock); + } + // handle all the dead clients now + cleanup_event_listeners(inactive_listeners); + return SDK_RET_OK; +} + +static void +event_monitor_timer_cb (sdk::event_thread::timer_t *timer) +{ + // handle all the events + handle_events(1, event_get()); +} + +/// \brief process an event subscribe request from client +/// \param[in] req pointer to incoming request +/// \return SDK_RET_OK if success or error code in case of failure +sdk_ret_t +process_event_subscribe_req (aga_event_subscribe_args_t *req) +{ + gpu_event_record_t event_record = {}; + + for (size_t i = 0; i < req->events.size(); i++) { + AGA_TRACE_DEBUG("Rcvd event {} subscribe request, client {}, " + "client ctxt {}, stream {}", req->events[i], + req->client_ctxt->client.c_str(), + (void *)req->client_ctxt, + (void *)req->client_ctxt->stream); + for (size_t g = 0; g < req->gpu_ids.size(); g++) { + uint32_t d = req->gpu_ids[g]; + auto& event_map = g_gpu_event_db[gpu_get_handle(d)].event_map; + + // lock the event map for this device + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + // check if this event was of interest to any client or happened + // already + auto event_map_it = event_map.find(req->events[i]); + if (event_map_it == event_map.end()) { + // 1st time anyone is subscribing to this event + event_record.client_info.client_set.insert(req->client_ctxt); + event_map[req->events[i]] = event_record; + } else { + // atleast one client is already interested in this event, check + // if this particular client already subscribed to this event + auto set_it = event_map_it->second.client_info.client_set.find( + req->client_ctxt); + if (set_it == + event_map_it->second.client_info.client_set.end()) { + // this client is a new listener for this event + event_map_it->second.client_info.client_set.insert( + req->client_ctxt); + } else { + // this client is already subscribed to this event + } + } + // unlock the event map for this device + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + } + } + return SDK_RET_OK; +} + +/// \brief callback function to process IPC msg from gRPC thread +/// to 
handle event subscription requests +/// \param[in] msg received IPC message +/// \param[in] ctxt opaque context (used when callback was registered) +static void +event_subscribe_ipc_cb (sdk::ipc::ipc_msg_ptr msg, const void *ctxt) +{ + sdk_ret_t ret; + aga_event_subscribe_args_t *req; + + req = *(aga_event_subscribe_args_t **)msg->data(); + if (req == NULL) { + AGA_TRACE_ERR("Ignoring NULL event subscribe request received"); + return; + } + ret = process_event_subscribe_req(req); + sdk::ipc::respond(msg, &ret, sizeof(ret)); +} + +static void +event_monitor_thread_init (void *ctxt) +{ + static sdk::event_thread::timer_t event_monitor_timer; + + // initialize event monitoring state + event_monitor_init(); + // subscribe to all IPC msgs of interest + sdk::ipc::reg_request_handler(AGA_IPC_MSG_ID_EVENT_SUBSCRIBE, + event_subscribe_ipc_cb, NULL); + // start event monitoring timer + sdk::event_thread::timer_init(&event_monitor_timer, event_monitor_timer_cb, + AGA_SMI_EVENT_MONITOR_START_DELAY, + AGA_SMI_EVENT_MONITOR_INTERVAL); + sdk::event_thread::timer_start(&event_monitor_timer); +} + +static void +event_monitor_thread_exit (void *ctxt) +{ + // cleanup the event state + for (uint32_t d = 0; d < AGA_MOCK_NUM_GPU; d++) { + SDK_SPINLOCK_LOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + g_gpu_event_db[gpu_get_handle(d)].event_map.clear(); + SDK_SPINLOCK_UNLOCK(&g_gpu_event_db[gpu_get_handle(d)].slock); + } +} + +sdk_ret_t +spawn_event_monitor_thread (void) +{ + g_event_monitor_thread = + sdk::event_thread::event_thread::factory( + "event-monitor", AGA_THREAD_ID_EVENT_MONITOR, + sdk::lib::THREAD_ROLE_CONTROL, 0x0, event_monitor_thread_init, + event_monitor_thread_exit, NULL, // message + sdk::lib::thread::priority_by_role(sdk::lib::THREAD_ROLE_CONTROL), + sdk::lib::thread::sched_policy_by_role(sdk::lib::THREAD_ROLE_CONTROL), + (THREAD_YIELD_ENABLE | THREAD_SYNC_IPC_ENABLE)); + SDK_ASSERT_TRACE_RETURN((g_event_monitor_thread != NULL), SDK_RET_ERR, + "GPU event monitor thread create failure"); + g_event_monitor_thread->start(NULL); + return SDK_RET_OK; +} + +sdk_ret_t +smi_init (aga_api_init_params_t *init_params) +{ + // spawn event monitor thread + spawn_event_monitor_thread(); + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_reset (aga_gpu_handle_t gpu_handle, aga_gpu_reset_type_t reset_type) +{ + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_update (aga_gpu_handle_t gpu_handle, aga_gpu_spec_t *spec, + uint64_t upd_mask) +{ + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_fill_device_topology (aga_gpu_handle_t gpu_handle, + aga_device_topology_info_t *info) +{ + uint32_t gpu_id; + uint32_t cnt = 0; + static std::string name = "GPU"; + + // get linear GPU index from device name + sscanf(info->device.name, "GPU%u", &gpu_id); + for (uint32_t i = 0; i < AGA_MOCK_NUM_GPU; i++) { + if (gpu_handle != gpu_get_handle(i)) { + info->peer_device[cnt].peer_device.type = AGA_DEVICE_TYPE_GPU; + strcpy(info->peer_device[cnt].peer_device.name, + (name + std::to_string(i)).c_str()); + info->peer_device[cnt].num_hops = 1; + info->peer_device[cnt].connection.type = AGA_IO_LINK_TYPE_XGMI; + info->peer_device[cnt].link_weight = 15 + (15 * ((i + gpu_id) % 5)); + info->peer_device[cnt].valid = true; + cnt++; + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_partition_id (aga_gpu_handle_t gpu_handle, uint32_t *partition_id) +{ + *partition_id = 0; + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_virtualization_mode (aga_gpu_handle_t gpu_handle, + aga_gpu_virtualization_mode_t *mode) +{ + *mode = 
AGA_VIRTUALIZATION_MODE_BAREMETAL; + return SDK_RET_OK; +} + +sdk_ret_t +smi_get_gpu_partition_info (aga_gpu_handle_t gpu_handle, bool *capable, + aga_gpu_compute_partition_type_t *compute_partition, + aga_gpu_memory_partition_type_t *memory_partition) +{ + *capable = true; + *compute_partition = AGA_GPU_COMPUTE_PARTITION_TYPE_SPX; + *memory_partition = AGA_GPU_MEMORY_PARTITION_TYPE_NPS1; + return SDK_RET_OK; +} + +sdk_ret_t +smi_discover_gpus (uint32_t *num_gpus, aga_gpu_handle_t *gpu_handles, + aga_obj_key_t *gpu_keys) +{ + if (!num_gpus) { + return SDK_RET_ERR; + } + *num_gpus = AGA_MOCK_NUM_GPU; + for (uint32_t i = 0; i < *num_gpus; i++) { + gpu_handles[i] = gpu_get_handle(i); + } + if (gpu_keys) { + for (uint32_t i = 0; i < *num_gpus; i++) { + gpu_keys[i] = gpu_uuid(i, gpu_get_unique_id(i)); + } + } + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_get_bad_page_count (void *gpu_obj, + uint32_t *num_bad_pages) +{ + *num_bad_pages = 1; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_get_bad_page_records (void *gpu_obj, + uint32_t num_bad_pages, + aga_gpu_bad_page_record_t *records) +{ + gpu_entry *gpu = (gpu_entry *)gpu_obj; + + records[0].key = gpu->key(); + records[0].page_address = 0x5c70ec; + records[0].page_size = 4096; + records[0].page_status = AGA_GPU_PAGE_STATUS_UNRESERVABLE; + return SDK_RET_OK; +} + +sdk_ret_t +smi_gpu_get_cper_entries (aga_gpu_handle_t gpu_handle, + aga_cper_severity_t severity, aga_cper_info_t *info) +{ + uint64_t gpu_key; + std::ostringstream oss; + auto cper_entry = &info->cper_entry[info->num_cper_entry++]; + + gpu_key = (uint64_t)gpu_handle; + oss << (gpu_key % 8) + 1 << ":" << (gpu_key+ 5) % 8 + 1; + cper_entry->record_id = oss.str(); + cper_entry->severity = AGA_CPER_SEVERITY_FATAL; + cper_entry->revision = 256; + + oss.str(""); + oss << std::setfill('0') << "2025-09-" << std::setw(2) << + (gpu_key % 31) + 1 << " 15:00:" << std::setw(2) << (gpu_key % 60) + 1; + cper_entry->timestamp = oss.str(); + cper_entry->notification_type = AGA_CPER_NOTIFICATION_TYPE_MCE; + cper_entry->creator_id = "amdgpu"; + cper_entry->num_af_id = 1; + cper_entry->af_id[0] = 30; + return SDK_RET_OK; +} + +} // namespace aga diff --git a/sw/nic/gpuagent/cli/cmd/gpu.go b/sw/nic/gpuagent/cli/cmd/gpu.go index a3f65e0..9644c83 100644 --- a/sw/nic/gpuagent/cli/cmd/gpu.go +++ b/sw/nic/gpuagent/cli/cmd/gpu.go @@ -1098,6 +1098,20 @@ func printUsageHdr(indent string) { } } +func printViolationAccumulatedHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "GPU GFX clock host limit accumulated:\n") + printHdr = true + } +} + +func printViolationPercentageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "GPU GFX clock host limit percentage:\n") + printHdr = true + } +} + func printGPUStats(gpu *aga.GPU, statsOnly bool) { var indent string spec := gpu.GetSpec() @@ -1650,6 +1664,7 @@ func printGPUStats(gpu *aga.GPU, statsOnly bool) { } } if stats.GetViolationStats() != nil { + printHdr = false vStats := stats.GetViolationStats() if vStats.GetCurrentAccumulatedCounter() != UINT64_MAX_VAL { fmt.Printf(indent+"%-38s : %d\n", "Current accumulated counter", @@ -1679,6 +1694,168 @@ func printGPUStats(gpu *aga.GPU, statsOnly bool) { "HBM thermal residency accumulated", vStats.GetHBMThermalResidencyAccumulated()) } + if vStats.GetProcessorHotResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "Processor hot residency percentage", + vStats.GetProcessorHotResidencyPercentage()) + } + if vStats.GetPPTResidencyPercentage() != UINT64_MAX_VAL { + 
fmt.Printf(indent+"%-38s : %d%%\n", + "PPT residency percentage", + vStats.GetPPTResidencyPercentage()) + } + if vStats.GetSocketThermalResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "Socket thermal residency percentage", + vStats.GetSocketThermalResidencyPercentage()) + } + if vStats.GetVRThermalResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "VR thermal residency percentage", + vStats.GetVRThermalResidencyPercentage()) + } + if vStats.GetHBMThermalResidencyPercentage() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d%%\n", + "HBM thermal residency percentage", + vStats.GetHBMThermalResidencyPercentage()) + } + validEntry := false + gStr := fmt.Sprintf(" %-36s : ", "Power") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitPowerAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Thermal") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTHMAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Low Utilization") + for _, gfx := range stats.GetViolationStats().GetGFXLowUtilizationAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Total") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTotalAccumulated() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d ", gStr, gfx) + } + } + if validEntry { + printViolationAccumulatedHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Power") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitPowerPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + printHdr = false + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Thermal") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTHMPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", 
gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Low Utilization") + for _, gfx := range stats.GetViolationStats().GetGFXLowUtilizationPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + + gStr = fmt.Sprintf(" %-36s : ", "Total") + for _, gfx := range stats.GetViolationStats().GetGFXBelowHostLimitTotalPercentage() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT64_MAX_VAL || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printViolationPercentageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } } fmt.Printf("\n%s\n", strings.Repeat("-", 80)) diff --git a/sw/nic/gpuagent/cli/cmd/gpu.go.orig b/sw/nic/gpuagent/cli/cmd/gpu.go.orig new file mode 100644 index 0000000..a3f65e0 --- /dev/null +++ b/sw/nic/gpuagent/cli/cmd/gpu.go.orig @@ -0,0 +1,2034 @@ +// +// Copyright(C) Advanced Micro Devices, Inc. All rights reserved. +// +// You may not use this software and documentation (if any) (collectively, +// the "Materials") except in compliance with the terms and conditions of +// the Software License Agreement included with the Materials or otherwise as +// set forth in writing and signed by you and an authorized signatory of AMD. +// If you do not have a copy of the Software License Agreement, contact your +// AMD representative for a copy. +// +// You agree that you will not reverse engineer or decompile the Materials, +// in whole or in part, except as allowed by applicable law. +// +// THE MATERIALS ARE DISTRIBUTED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR +// REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. 
+// + +//------------------------------------------------------------------------------ +/// +/// \file +/// gpctl command line interface for gpu protobufs +/// +//------------------------------------------------------------------------------ + +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "strconv" + "strings" + + uuid "github.com/satori/go.uuid" + "github.com/spf13/cobra" + yaml "gopkg.in/yaml.v2" + + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" +) + +var ( + gpuID string + gpuAdminState string + overDriveLevel uint32 + powerCap uint64 + perfLevel string + gpuClkFreq string + memClkFreq string + fanSpeed uint64 + gpuAdminStateVal aga.GPUAdminState + PerformanceLevelVal aga.GPUPerformanceLevel + clockType aga.GPUClockType + memPartition string + memPartitionVal aga.GPUMemoryPartitionType + computePartition string + computePartitionVal aga.GPUComputePartitionType + gpuClkType string + gpuClkFreqLo uint32 + gpuClkFreqHi uint32 + memClkFreqLo uint32 + memClkFreqHi uint32 + printHdr bool + severity string +) + +const ( + UINT16_MAX_VAL_UINT16 uint16 = 0xffff + UINT16_MAX_VAL_UINT32 uint32 = 0xffff + UINT16_MAX_VAL_UINT64 uint64 = 0xffff + UINT32_MAX_VAL_UINT32 uint32 = 0xffffffff + UINT32_MAX_VAL_UINT64 uint64 = 0xffffffff + UINT64_MAX_VAL uint64 = 0xffffffffffffffff + FLOAT32_INVALID_VAL float32 = 65535.0 +) + +var gpuShowCmd = &cobra.Command{ + Use: "gpu", + Short: "show GPU information", + Long: "show GPU information", + RunE: gpuShowCmdHandler, +} + +var gpuAllShowCmd = &cobra.Command{ + Use: "all", + Short: "show all GPU object", + Long: "show all GPU object", + RunE: gpuAllShowCmdHandler, +} + +var gpuPartitionsShowCmd = &cobra.Command{ + Use: "compute-partition", + Short: "show physical GPU's compute partitions", + Long: "show physical GPU's compute partitions", + RunE: gpuPartitionsShowCmdHandler, +} + +var gpuBadPageShowCmd = &cobra.Command{ + Use: "bad-page", + Short: "show GPU bad page information", + Long: "show GPU bad page information", + RunE: gpuBadPageShowCmdHandler, +} + +var gpuCPERShowCmd = &cobra.Command{ + Use: "cper-records", + Short: "show GPU CPER records", + Long: "show GPU CPER information", + RunE: gpuCPERShowCmdHandler, +} + +var gpuStatsShowCmd = &cobra.Command{ + Use: "statistics", + Short: "show GPU statistics", + Long: "show GPU statistics", + RunE: gpuStatsShowCmdHandler, +} + +var gpuUpdateCmd = &cobra.Command{ + Use: "gpu", + Short: "update gpu object", + Long: "update gpu object", + PreRunE: gpuUpdateCmdPreRunE, + RunE: gpuUpdateCmdHandler, +} + +var gpuResetCmd = &cobra.Command{ + Use: "reset", + Short: "reset gpu object/settings", + Long: "reset gpu object/settings", + PreRunE: gpuResetCmdPreRunE, + RunE: gpuResetCmdHandler, +} + +func init() { + ShowCmd.AddCommand(gpuShowCmd) + gpuShowCmd.Flags().BoolP("yaml", "y", false, "Output in yaml") + gpuShowCmd.Flags().BoolP("json", "j", false, "Output in json") + gpuShowCmd.Flags().BoolP("status", "s", false, "Show GPU status") + gpuShowCmd.Flags().Bool("summary", false, "Display number of objects") + gpuShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuShowCmd.Flags().BoolP("partitioned", "p", false, + "Show only partitioned GPUs") + + gpuShowCmd.AddCommand(gpuAllShowCmd) + gpuAllShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + + gpuShowCmd.AddCommand(gpuStatsShowCmd) + gpuStatsShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + + 
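+
+    // The remaining registrations follow the same pattern as those above:
+    // each show subcommand is attached to gpuShowCmd with an optional "--id"
+    // flag to scope the query to a single GPU, plus "--yaml"/"--json" output
+    // flags where structured output is supported; gpuUpdateCmd is attached to
+    // DebugUpdateCmd and gpuResetCmd to gpuUpdateCmd, each carrying its
+    // tuning flags with "--id" marked as required.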
gpuShowCmd.AddCommand(gpuPartitionsShowCmd) + gpuPartitionsShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify partitioned GPU's id") + gpuPartitionsShowCmd.Flags().BoolP("yaml", "y", false, "Output in yaml") + gpuPartitionsShowCmd.Flags().BoolP("json", "j", false, "Output in json") + + gpuShowCmd.AddCommand(gpuBadPageShowCmd) + gpuBadPageShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", + "Specify GPU id") + + gpuShowCmd.AddCommand(gpuCPERShowCmd) + gpuCPERShowCmd.Flags().StringVarP(&gpuID, "id", "i", "", + "Specify GPU id") + gpuCPERShowCmd.Flags().StringVarP(&severity, "severity", "s", "all", + "Specify CPER severity (\"fatal\", \"non-fatal-uncorrected\", "+ + "\"non-fatal-corrected\" or \"all\")") + gpuCPERShowCmd.Flags().BoolP("json", "j", false, "Output in json") + + DebugUpdateCmd.AddCommand(gpuUpdateCmd) + gpuUpdateCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuUpdateCmd.Flags().StringVarP(&gpuAdminState, "admin-state", "a", "", + "Specify admin state (up/down)") + gpuUpdateCmd.Flags().Uint32VarP(&overDriveLevel, "overdrive-level", "o", 0, + "Specify GPU clock overdrive level in percentage") + gpuUpdateCmd.Flags().Uint64VarP(&powerCap, "power-cap", "p", 0, + "Specify max package power GPU can consume (in Watts)") + gpuUpdateCmd.Flags().StringVarP(&perfLevel, "perf-level", "l", "", + "Specify GPU performance level (none/auto/low/high/deterministic/"+ + "memclock/sysclock/manual)") + gpuUpdateCmd.Flags().StringVarP(&gpuClkType, "clock-type", "t", "", + "Specify GPU clock type (memory, system, video or data)") + gpuUpdateCmd.Flags().StringVarP(&gpuClkFreq, "clock-frequency", "c", "", + "Specify GPU clock frequency range (lo-hi)") + gpuUpdateCmd.Flags().StringVarP(&memPartition, "memory-partition", "m", "", + "Specify GPU memory partition type (NPS1, NPS2, NPS4, NPS8)") + gpuUpdateCmd.Flags().StringVarP(&computePartition, "compute-partition", "", + "", "Specify GPU compute partition type (SPX, DPX, TPX, QPX, CPX)") + gpuUpdateCmd.Flags().Uint64VarP(&fanSpeed, "fan-speed", "s", 0, + "Specify fan speed") + gpuUpdateCmd.MarkFlagRequired("id") + // TODO: RAS spec + + gpuUpdateCmd.AddCommand(gpuResetCmd) + gpuResetCmd.Flags().StringVarP(&gpuID, "id", "i", "", "Specify GPU id") + gpuResetCmd.Flags().Bool("clocks", false, + "Reset clocks and overdrive to default") + gpuResetCmd.Flags().Bool("fans", false, "Reset fans to automatic control") + gpuResetCmd.Flags().Bool("power-profile", false, + "Reset power profile to default") + gpuResetCmd.Flags().Bool("power-overdrive", false, + "Set the maximum GPU power back to the device deafult state") + gpuResetCmd.Flags().Bool("xgmi-error", false, + "Reset XGMI error status/count") + gpuResetCmd.Flags().Bool("perf-determinism", false, + "Disable performance determinism") + gpuResetCmd.Flags().Bool("compute-partition", false, + "Resets to boot compute partition state") + gpuResetCmd.Flags().Bool("nps-mode", false, "Reset to boot NPS mode state") + gpuResetCmd.MarkFlagRequired("id") +} + +func printGPUPartitions(resp *aga.GPUComputePartition) { + fmt.Printf("%-40s%-16s", utils.IdToStr(resp.GetId()), + strings.Replace(resp.GetPartitionType().String(), + "GPU_COMPUTE_PARTITION_TYPE_", "", -1)) + + for i, partition := range resp.GPUPartition { + if i != 0 { + fmt.Printf("%-56s%-40s\n", "", utils.IdToStr(partition)) + } else { + fmt.Printf("%-40s\n", "", utils.IdToStr(partition)) + } + } +} + +type ShadowGPUComputePartition struct { + Id string + PartitionType aga.GPUComputePartitionType + GPUPartition []string +} + +func 
NewGPUComputePartition(resp *aga.GPUComputePartition) *ShadowGPUComputePartition { + var gpuPartitions []string + for _, child := range resp.GetGPUPartition() { + gpuPartitions = append(gpuPartitions, utils.IdToStr(child)) + } + return &ShadowGPUComputePartition{ + Id: utils.IdToStr(resp.GetId()), + PartitionType: resp.GetPartitionType(), + GPUPartition: gpuPartitions, + } +} + +func printGPUPartitionsJson(resp *aga.GPUComputePartition) { + partition := NewGPUComputePartition(resp) + b, _ := json.MarshalIndent(partition, " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuPartitionsShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUComputePartitionGetResponse{} + var req *aga.GPUComputePartitionGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUComputePartitionGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUComputePartitionGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUComputePartitionGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print GPUs + if cmd != nil && cmd.Flags().Changed("yaml") { + yamlArr, _ := yaml.Marshal(respMsg.Response) + fmt.Println(string(yamlArr)) + } else if cmd != nil && cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, resp := range respMsg.Response { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUPartitionsJson(resp) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else { + hdrLine := strings.Repeat("-", 96) + fmt.Println(hdrLine) + fmt.Printf("%-40s%-16s%-40s\n", "PhysicalGPU", "PartitionType", + "GPUPartitions") + fmt.Println(hdrLine) + for _, resp := range respMsg.Response { + printGPUPartitions(resp) + } + } + return nil +} + +type ShadowGPUCPEREntry struct { + GPU string + CPEREntry []*aga.CPEREntry +} + +func NewCPER(cper *aga.GPUCPEREntry) *ShadowGPUCPEREntry { + return &ShadowGPUCPEREntry{ + GPU: utils.IdToStr(cper.GetGPU()), + CPEREntry: cper.GetCPEREntry(), + } +} + +func printGPUCPEREntryJson(cper *aga.GPUCPEREntry) { + b, _ := json.MarshalIndent(NewCPER(cper), " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuCPERShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUCPERGetResponse{} + var req *aga.GPUCPERGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = 
&aga.GPUCPERGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUCPERGetRequest{ + Id: [][]byte{}, + } + } + switch strings.ToLower(severity) { + case "all": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NONE + case "non-fatal-uncorrected": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NON_FATAL_UNCORRECTED + case "fatal": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_FATAL + case "non-fatal-corrected": + req.Severity = aga.CPERSeverity_CPER_SEVERITY_NON_FATAL_CORRECTED + default: + return fmt.Errorf("Invalid value specified for \"--severity\"") + } + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUCPERGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU CPER failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print CPER information + if cmd != nil && cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, cper := range respMsg.CPER { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUCPEREntryJson(cper) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else { + hdrLine := strings.Repeat("-", 156) + fmt.Println(hdrLine) + fmt.Printf("%-20s%-40s%-16s%-25s%-10s%-10s%-15s%-20s\n", + "Timestamp", "GPU", "RecordId", "Severity", "Revision", "CreatorId", + "NtfnType", "AMDFieldId") + fmt.Println(hdrLine) + for _, cper := range respMsg.CPER { + gpuStr := utils.IdToStr(cper.GetGPU()) + for _, entry := range cper.GetCPEREntry() { + severityStr := strings.Replace(entry.GetSeverity().String(), + "CPER_SEVERITY_", "", -1) + ntfnTypeStr := + strings.Replace(entry.GetNotificationType().String(), + "CPER_NOTIFICATION_TYPE_", "", -1) + ntfnTypeStr = strings.Replace(ntfnTypeStr, "_", "-", -1) + + var afIdBuilder strings.Builder + indent := strings.Repeat(" ", 121) + for i, afId := range entry.GetAFId() { + afIdBuilder.WriteString(strconv.FormatUint(afId, 10)) + if (i+1)%3 == 0 { + if i != len(entry.GetAFId())-1 { + afIdBuilder.WriteString("\n" + indent) + } + } else if i != len(entry.GetAFId())-1 { + afIdBuilder.WriteString(", ") + } + } + afIdStr := afIdBuilder.String() + fmt.Printf("%-20s%-40s%-16s%-25s%-10d%-10s%-15s%-20s\n", + entry.GetTimestamp(), gpuStr, entry.GetRecordId(), + severityStr, entry.GetRevision(), entry.GetCreatorId(), + ntfnTypeStr, afIdStr) + } + } + } + return nil +} +func gpuShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, 
cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + var response []*aga.GPU + for _, resp := range respMsg.Response { + status := resp.GetStatus() + if len(status.GetGPUPartition()) > 0 { + if cmd != nil && cmd.Flags().Changed("partitioned") { + response = append(response, resp) + } + } else { + if cmd == nil || !cmd.Flags().Changed("partitioned") { + response = append(response, resp) + } + } + } + + // print GPUs + if cmd != nil && cmd.Flags().Changed("yaml") { + yamlArr, _ := yaml.Marshal(response) + fmt.Println(string(yamlArr)) + } else if cmd != nil && cmd.Flags().Changed("json") { + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("[\n") + } + rcvdResp := false + for _, resp := range response { + if rcvdResp == true { + // json output requires a , after each GPU + fmt.Printf(",\n") + } + printGPUJson(resp) + rcvdResp = true + } + // json output requires that all GPUs are listed within [] braces + if cmd.Flags().Changed("json") { + fmt.Printf("\n]\n") + } + } else if cmd != nil && cmd.Flags().Changed("summary") { + printGPUSummary(len(response)) + } else if cmd != nil && cmd.Flags().Changed("status") { + for _, resp := range response { + printGPUStatus(resp, true) + } + printGPUSummary(len(response)) + } else { + for _, resp := range response { + printGPUSpec(resp, true) + } + printGPUSummary(len(response)) + } + return nil +} + +func printGPUBadPageHeader() { + hdrLine := strings.Repeat("-", 80) + fmt.Println(hdrLine) + fmt.Printf("%-40s%-16s%-12s%-12s\n", + "GPU", "PageAddress", "PageSize", "Status") + fmt.Println(hdrLine) +} + +func gpuBadPageShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + var req *aga.GPUBadPageGetRequest + var rsp *aga.GPUBadPageGetResponse + if cmd != nil && cmd.Flags().Changed("id") { + // get for specific GPU + req = &aga.GPUBadPageGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get for all GPUs + req = &aga.GPUBadPageGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent running?") + } + defer c.Close() + defer cancel() + + client := aga.NewDebugGPUSvcClient(c) + stream, err := client.GPUBadPageGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting mapping failed, err %v", err) + } + firstResp := true + currGPU := "" + for { + rsp, err = stream.Recv() + if err == io.EOF { + break + } + if err != nil { + return fmt.Errorf("GPU bad page get failure, err %v\n", err) + } + if rsp.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", rsp.ApiStatus) + } + if firstResp == true { + printGPUBadPageHeader() + firstResp = false + } + // print GPU bad pages + for _, record := range rsp.Record { + if currGPU == 
utils.IdToStr(record.GetGPU()) { + fmt.Printf("%-40s%-16x%-12d%-12s\n", "", + record.GetPageAddress(), record.GetPageSize(), + strings.ToLower(strings.Replace( + record.GetPageStatus().String(), + "GPU_PAGE_STATUS_", "", -1))) + } else { + currGPU = utils.IdToStr(record.GetGPU()) + fmt.Printf("%-40s%-16x%-12d%-12s\n", currGPU, + record.GetPageAddress(), record.GetPageSize(), + strings.ToLower(strings.Replace( + record.GetPageStatus().String(), + "GPU_PAGE_STATUS_", "", -1))) + } + } + } + return nil +} + +func gpuStatsShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print GPUs + for _, resp := range respMsg.Response { + printGPUStats(resp, true) + } + printGPUSummary(len(respMsg.Response)) + return nil +} + +func gpuAllShowCmdHandler(cmd *cobra.Command, args []string) error { + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + if cmd != nil { + if cmd.Flags().Changed("id") { + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + } + cmd.SilenceUsage = true + } + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd != nil && cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } else { + // get all GPUs + req = &aga.GPUGetRequest{ + Id: [][]byte{}, + } + } + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with %v error", respMsg.ApiStatus) + } + + // print GPUs + for _, resp := range respMsg.Response { + printGPUSpec(resp, false) + printGPUStatus(resp, false) + printGPUStats(resp, false) + } + printGPUSummary(len(respMsg.Response)) + return nil +} + +func printGPUSummary(count int) { + fmt.Printf("\nNo. 
of gpus : %d\n\n", count) +} + +func printGPUSpec(gpu *aga.GPU, specOnly bool) { + spec := gpu.GetSpec() + status := gpu.GetStatus() + + fmt.Printf("%-40s : %s (%d)\n", "Id", utils.IdToStr(spec.GetId()), + status.GetIndex()) + if spec.GetAdminState() != aga.GPUAdminState_GPU_ADMIN_STATE_NONE { + fmt.Printf("%-40s : %s\n", "Admin state", + strings.ToLower(strings.Replace(spec.GetAdminState().String(), + "GPU_ADMIN_STATE_", "", -1))) + } + if spec.GetOverDriveLevel() != UINT32_MAX_VAL_UINT32 { + fmt.Printf("%-40s : %v\n", "Clock overdrive level", + spec.GetOverDriveLevel()) + } + if spec.GetGPUPowerCap() != 0 { + fmt.Printf("%-40s : %d\n", "Power overdrive (in watts)", + spec.GetGPUPowerCap()) + } + if spec.GetPerformanceLevel() != + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_NONE { + fmt.Printf("%-40s : %s\n", "Performance level", + strings.ToLower(strings.Replace(spec.GetPerformanceLevel().String(), + "GPU_PERF_LEVEL_", "", -1))) + } + for _, clockFreq := range spec.GetClockFrequency() { + if clockFreq.GetLowFrequency() != UINT32_MAX_VAL_UINT32 && + clockFreq.GetHighFrequency() != UINT32_MAX_VAL_UINT32 { + fmt.Printf("%-40s : %s\n", "GPU clock type", + strings.Replace(clockFreq.GetClockType().String(), + "GPU_CLOCK_TYPE_", "", -1)) + fmt.Printf(" %-38s : %d - %d\n", + "Frequency range (in MHz)", + clockFreq.GetLowFrequency(), + clockFreq.GetHighFrequency()) + } + } + if spec.GetFanSpeed() != UINT64_MAX_VAL { + fmt.Printf("%-40s : %v\n", "Fan speed", spec.GetFanSpeed()) + } + if spec.GetComputePartitionType() != + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_NONE { + fmt.Printf("%-40s : %s\n", "Compute partition type", + strings.Replace(spec.GetComputePartitionType().String(), + "GPU_COMPUTE_PARTITION_TYPE_", "", -1)) + } + if spec.GetMemoryPartitionType() != + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NONE { + fmt.Printf("%-40s : %s\n", "Memory partition type", + strings.Replace(spec.GetMemoryPartitionType().String(), + "GPU_MEMORY_PARTITION_TYPE_", "", -1)) + } + // TODO: fill GPU RAS Spec + if specOnly { + fmt.Printf("\n%s\n", strings.Repeat("-", 80)) + } +} + +func printPCIeStatusHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "PCIe status : \n") + printHdr = true + } +} + +func printVRAMStatusHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "VRAM status:\n") + printHdr = true + } +} + +func printGPUStatus(gpu *aga.GPU, statusOnly bool) { + var indent string + spec := gpu.GetSpec() + status := gpu.GetStatus() + + if statusOnly { + fmt.Printf("\n%-38s : %s (%d)\n", "GPU id", utils.IdToStr(spec.GetId()), + status.GetIndex()) + indent = "" + } else { + fmt.Printf("\nStatus :\n") + indent = " " + } + fmt.Printf(indent+"%-38s : %d\n", "Index", status.GetIndex()) + fmt.Printf(indent+"%-38s : %d\n", "KFD id", status.GetKFDId()) + fmt.Printf(indent+"%-38s : %d\n", "DRM render id", status.GetDRMRenderId()) + fmt.Printf(indent+"%-38s : %d\n", "DRM card id", status.GetDRMCardId()) + fmt.Printf(indent+"%-38s : %s\n", "Virtualization mode", + strings.ToLower(strings.Replace(status.GetVirtualizationMode().String(), + "GPU_VIRTUALIZATION_MODE_", "", -1))) + fmt.Printf(indent+"%-38s : 0x%x\n", "GPU handle", status.GetGPUHandle()) + if status.GetSerialNum() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Serial number", + status.GetSerialNum()) + } + if status.GetCardSeries() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card series", status.GetCardSeries()) + } + if status.GetCardModel() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card model", 
status.GetCardModel()) + } + if status.GetCardVendor() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card vendor", status.GetCardVendor()) + } + if status.GetCardSKU() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Card SKU", status.GetCardSKU()) + } + if status.GetDriverVersion() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Driver version", + status.GetDriverVersion()) + } + if status.GetVBIOSPartNumber() != "" { + fmt.Printf(indent+"%-38s : %s\n", "VBIOS part number", + status.GetVBIOSPartNumber()) + } + if status.GetVBIOSVersion() != "" { + fmt.Printf(indent+"%-38s : %s\n", "VBIOS version", + status.GetVBIOSVersion()) + } + switch spec.GetComputePartitionType() { + case aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_NONE: + break + default: + fmt.Printf(indent+"%-38s : %d\n", "Partition Id", + status.GetPartitionId()) + } + fwVer := status.GetFirmwareVersion() + if len(fwVer) != 0 { + fmt.Printf(indent + "Firmware versions:\n") + for i := 0; i < len(fwVer); i++ { + fwVerStr := fmt.Sprintf("%s %s", fwVer[i].GetFirmware(), + "firmware version") + fmt.Printf(indent+" %-36s : %s\n", fwVerStr, fwVer[i].GetVersion()) + } + } + if status.GetMemoryVendor() != "" { + fmt.Printf(indent+"%-38s : %s\n", "Memory vendor", + status.GetMemoryVendor()) + } + if status.GetOperStatus() != aga.GPUOperStatus_GPU_OPER_STATUS_NONE { + fmt.Printf(indent+"%-38s : %s\n", "Operational status", + strings.ToLower(strings.Replace(status.GetOperStatus().String(), + "GPU_OPER_STATUS_", "", -1))) + } + clkStr := "" + idxr := 0 + for _, clkStatus := range status.GetClockStatus() { + curClkStr := strings.Replace(clkStatus.GetType().String(), + "GPU_CLOCK_TYPE_", "", -1) + if clkStr != curClkStr { + clkStr = curClkStr + idxr = 0 + } + if clkStatus.GetType() != aga.GPUClockType_GPU_CLOCK_TYPE_NONE && + clkStatus.GetFrequency() != 0 && + clkStatus.GetFrequency() != UINT16_MAX_VAL_UINT32 && + clkStatus.GetFrequency() != UINT32_MAX_VAL_UINT32 && + clkStatus.GetLowFrequency() != UINT32_MAX_VAL_UINT32 && + clkStatus.GetHighFrequency() != UINT32_MAX_VAL_UINT32 { + fmt.Printf(indent+"%-38s : %s_%d\n", "GPU clock type", clkStr, idxr) + fmt.Printf(indent+" %-36s : %d\n", "Frequency (in MHz)", + clkStatus.GetFrequency()) + fmt.Printf(indent+" %-36s : %d - %d\n", "Frequency range (in MHz)", + clkStatus.GetLowFrequency(), clkStatus.GetHighFrequency()) + if clkStatus.GetLocked() { + fmt.Printf(indent+" %-36s : true\n", "Clock locked") + } + if clkStatus.GetDeepSleep() { + fmt.Printf(indent+" %-36s : true\n", "Deep sleep enabled") + } + } + idxr++ + } + kfdPids := status.GetKFDProcessId() + if len(kfdPids) != 0 { + kfdPidStr := fmt.Sprintf("%-38s : ", "KFD process id using GPU") + for i := 0; i < len(kfdPids); i++ { + fmt.Printf(indent+"%-41s%d\n", kfdPidStr, kfdPids[i]) + kfdPidStr = "" + } + } + // TODO: fill GPU RAS status + xgmiStatus := status.GetXGMIStatus() + if xgmiStatus.GetErrorStatus() != + aga.GPUXGMIErrorStatus_GPU_XGMI_STATUS_NONE { + fmt.Printf(indent+"%-38s : %s\n", "XGMI error status", + strings.ToLower(strings.Replace( + xgmiStatus.GetErrorStatus().String(), "GPU_XGMI_STATUS_", + "", -1))) + } + if xgmiStatus.GetWidth() != 0 && + xgmiStatus.GetWidth() != UINT16_MAX_VAL_UINT64 && + xgmiStatus.GetWidth() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %v\n", "XGMI link width (in GB/s)", + xgmiStatus.GetWidth()) + } + if xgmiStatus.GetSpeed() != 0 && + xgmiStatus.GetSpeed() != UINT16_MAX_VAL_UINT64 && + xgmiStatus.GetSpeed() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %v\n", "XGMI link speed (in GB/s)", + 
xgmiStatus.GetSpeed()) + } + if status.GetThrottlingStatus() != + aga.GPUThrottlingStatus_GPU_THROTTLING_STATUS_NONE { + fmt.Printf(indent+"%-38s : %s\n", "GPU throttling", + strings.ToLower(strings.Replace( + status.GetThrottlingStatus().String(), "GPU_THROTTLING_STATUS_", + "", -1))) + } + if (status.GetFWTimestamp() != 0) && + (status.GetFWTimestamp() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %v\n", "FW timestamp (in ns)", + status.GetFWTimestamp()) + } + /* commenting voltage-curve-point display for time being until it is added + back to status proto + vcp := status.GetVoltageCurvePoint() + if len(vcp) != 0 { + valid_vc := false + for i := 0; i < len(vcp); i++ { + if vcp[i].GetFrequency() != 0 || vcp[i].GetVoltage() != 0 { + valid_vc = true + } + } + if valid_vc { + fmt.Printf(indent+"Voltage curve points:\n") + for i := 0; i < len(vcp); i++ { + if vcp[i].GetFrequency() != 0 || vcp[i].GetVoltage() != 0 { + fmt.Printf(indent+" %-36s : %d\n", "Curve point", + vcp[i].GetPoint()) + fmt.Printf(indent+" %-34s : %d\n", + "Frequency (in MHz)", + vcp[i].GetFrequency()) + fmt.Printf(indent+" %-34s : %d\n", "Voltage (in mV)", + vcp[i].GetVoltage()) + } + } + } + } + */ + if status.GetPCIeStatus() != nil { + printHdr = false + pcie := status.GetPCIeStatus() + if pcie.GetVersion() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Version", pcie.GetVersion()) + } + if pcie.GetSlotType() != aga.PCIeSlotType_PCIE_SLOT_TYPE_NONE { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "Card form factor", + strings.ToLower(strings.Replace(pcie.GetSlotType().String(), + "PCIE_SLOT_TYPE_", "", -1))) + } + if pcie.GetPCIeBusId() != "" { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "Bus id", pcie.GetPCIeBusId()) + } + if pcie.GetWidth() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current number of lanes", + pcie.GetWidth()) + } + if pcie.GetMaxWidth() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Maximum number of lanes", + pcie.GetMaxWidth()) + } + if pcie.GetSpeed() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current speed (in GT/s)", + pcie.GetSpeed()) + } + if pcie.GetMaxSpeed() != 0 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Maximum speed (in GT/s)", + pcie.GetMaxSpeed()) + } + if pcie.GetBandwidth() != 0 && + pcie.GetBandwidth() != UINT32_MAX_VAL_UINT64 { + printPCIeStatusHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current bandwidth (in MB/s)", + pcie.GetBandwidth()) + } + } + if status.GetVRAMStatus() != nil { + printHdr = false + vram := status.GetVRAMStatus() + if vram.GetType() != aga.VRAMType_VRAM_TYPE_NONE { + printVRAMStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "VRAM type", + strings.ToLower(strings.Replace(vram.GetType().String(), + "VRAM_TYPE_", "", -1))) + } + if vram.GetVendor() != "" { + printVRAMStatusHdr(indent) + fmt.Printf(indent+" %-36s : %s\n", "VRAM vendor", + strings.ToLower(vram.GetVendor())) + } else { + fmt.Printf(indent+" %-36s : %s\n", "VRAM vendor", "-") + } + if vram.GetSize_() != 0 { + printVRAMStatusHdr(indent) + fmt.Printf(indent+" %-36s : %v\n", "VRAM size (in MB)", + vram.GetSize_()) + } + } + if statusOnly { + fmt.Printf("\n%s\n", strings.Repeat("-", 80)) + } +} + +func printVRAMUsageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "VRAM usage:\n") + printHdr = true + } +} + +func printVoltageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + 
"GPU voltage statistics:\n") + printHdr = true + } +} + +func printTemperatureHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "Temperature information:\n") + printHdr = true + } +} + +func printPCIeHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "PCIe statistics:\n") + printHdr = true + } +} + +func printUsageHdr(indent string) { + if printHdr == false { + fmt.Printf(indent + "Current GPU usage:\n") + printHdr = true + } +} + +func printGPUStats(gpu *aga.GPU, statsOnly bool) { + var indent string + spec := gpu.GetSpec() + stats := gpu.GetStats() + status := gpu.GetStatus() + + if statsOnly { + fmt.Printf("\n%-38s : %s (%d)\n", "GPU id", utils.IdToStr(spec.GetId()), + status.GetIndex()) + indent = "" + } else { + fmt.Printf("\nStatistics :\n") + indent = " " + } + if stats.GetPackagePower() != 0 && + stats.GetPackagePower() != UINT16_MAX_VAL_UINT64 { + fmt.Printf(indent+"%-38s : %d\n", "Current graphics power (in Watts)", + stats.GetPackagePower()) + } + if stats.GetAvgPackagePower() != 0 && + stats.GetAvgPackagePower() != UINT16_MAX_VAL_UINT64 { + fmt.Printf(indent+"%-38s : %d\n", "Average graphics power (in Watts)", + stats.GetAvgPackagePower()) + } + if stats.GetTemperature() != nil { + printHdr = false + if stats.GetTemperature().GetEdgeTemperature() != 0 && + stats.GetTemperature().GetEdgeTemperature() != FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + fmt.Printf(indent+" %-36s : %.1f\n", "Edge temperature (in C)", + stats.GetTemperature().GetEdgeTemperature()) + } + if stats.GetTemperature().GetJunctionTemperature() != 0 && + stats.GetTemperature().GetJunctionTemperature() != + FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + fmt.Printf(indent+" %-36s : %.1f\n", "Junction temperature (in C)", + stats.GetTemperature().GetJunctionTemperature()) + } + if stats.GetTemperature().GetMemoryTemperature() != 0 && + stats.GetTemperature().GetMemoryTemperature() != + FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + fmt.Printf(indent+" %-36s : %.1f\n", "VRAM temperature (in C)", + stats.GetTemperature().GetMemoryTemperature()) + } + hbmTemp := stats.GetTemperature().GetHBMTemperature() + for index, temp := range hbmTemp { + if temp != 0 && temp != FLOAT32_INVALID_VAL { + printTemperatureHdr(indent) + hbmStr := "HBM " + strconv.Itoa(index) + " temperature (in C)" + fmt.Printf(indent+" %-36s : %.1f\n", hbmStr, temp) + } + } + } + if stats.GetUsage() != nil { + printHdr = false + if stats.GetUsage().GetGFXActivity() != 0 && + stats.GetUsage().GetGFXActivity() != UINT32_MAX_VAL_UINT32 && + stats.GetUsage().GetGFXActivity() <= 100 { + printUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "GFX activity", + stats.GetUsage().GetGFXActivity()) + } + if stats.GetUsage().GetUMCActivity() != 0 && + stats.GetUsage().GetUMCActivity() != UINT16_MAX_VAL_UINT32 && + stats.GetUsage().GetUMCActivity() <= 100 { + printUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "UMC activity", + stats.GetUsage().GetUMCActivity()) + } + if stats.GetUsage().GetMMActivity() != 0 && + stats.GetUsage().GetMMActivity() != UINT16_MAX_VAL_UINT32 && + stats.GetUsage().GetMMActivity() <= 100 { + printUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "MM activity", + stats.GetUsage().GetMMActivity()) + } + vStr := fmt.Sprintf(" %-36s : ", "VCN activity") + // used to decide if vcn activity should be printed or not + validEntry := false + for _, vcn := range stats.GetUsage().GetVCNActivity() { + // only if at least one of the vcn activities is a valid value do we + // 
print the field + if vcn == UINT16_MAX_VAL_UINT32 || vcn > 100 { + vStr = fmt.Sprintf("%sN/A ", vStr) + } else { + validEntry = true + vStr = fmt.Sprintf("%s%d%% ", vStr, vcn) + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", vStr) + validEntry = false + } + jStr := fmt.Sprintf(" %-36s : ", "JPEG activity") + for i, jpeg := range stats.GetUsage().GetJPEGActivity() { + // only if at least one of the jpeg activities is a valid value do + // we print the field + if jpeg == UINT16_MAX_VAL_UINT32 || jpeg > 100 { + jStr = fmt.Sprintf("%sN/A ", jStr) + } else { + validEntry = true + jStr = fmt.Sprintf("%s%d%% ", jStr, jpeg) + } + if (i+1)%8 == 0 { + jStr = fmt.Sprintf("%s\n%s%-41s", jStr, indent, "") + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", jStr) + validEntry = false + } + gStr := fmt.Sprintf(" %-36s : ", "GFX utilization") + for _, gfx := range stats.GetUsage().GetGFXBusyInst() { + // only if at least one of the gfx busy value is a valid value do we + // print the field + if gfx == UINT16_MAX_VAL_UINT32 || gfx > 100 { + gStr = fmt.Sprintf("%sN/A ", gStr) + } else { + validEntry = true + gStr = fmt.Sprintf("%s%d%% ", gStr, gfx) + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", gStr) + validEntry = false + } + vStr = fmt.Sprintf(" %-36s : ", "VCN utilization") + for _, vcn := range stats.GetUsage().GetVCNBusyInst() { + // only if at least one of the vcn busy value is a valid value do we + // print the field + if vcn == UINT16_MAX_VAL_UINT32 || vcn > 100 { + vStr = fmt.Sprintf("%sN/A ", vStr) + } else { + validEntry = true + vStr = fmt.Sprintf("%s%d%% ", vStr, vcn) + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", vStr) + validEntry = false + } + jStr = fmt.Sprintf(" %-36s : ", "JPEG utilization") + for i, jpeg := range stats.GetUsage().GetJPEGBusyInst() { + // only if at least one of the jpeg busy value is a valid value do + // we print the field + if jpeg == UINT16_MAX_VAL_UINT32 || jpeg > 100 { + jStr = fmt.Sprintf("%sN/A ", jStr) + } else { + validEntry = true + jStr = fmt.Sprintf("%s%d%% ", jStr, jpeg) + } + if (i+1)%8 == 0 { + jStr = fmt.Sprintf("%s\n%s%-41s", jStr, indent, "") + } + } + if validEntry { + printUsageHdr(indent) + fmt.Printf(indent+"%s\n", jStr) + validEntry = false + } + } + if stats.GetVoltage() != nil { + v := stats.GetVoltage() + printHdr = false + if v.GetVoltage() != 0 && v.GetVoltage() != UINT16_MAX_VAL_UINT64 { + printVoltageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current voltage (in mV)", + v.GetVoltage()) + } + if v.GetGFXVoltage() != 0 && + v.GetGFXVoltage() != UINT16_MAX_VAL_UINT64 { + printVoltageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", + "Current graphics voltage(in mV)", v.GetGFXVoltage()) + } + if v.GetMemoryVoltage() != 0 && + v.GetMemoryVoltage() != UINT16_MAX_VAL_UINT64 { + printVoltageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Current memory voltage(in mV)", + v.GetMemoryVoltage()) + } + } + if stats.GetPCIeStats() != nil { + printHdr = false + p := stats.GetPCIeStats() + if p.GetReplayCount() != 0 && p.GetReplayCount() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Replay count", + p.GetReplayCount()) + } + if p.GetRecoveryCount() != 0 && + p.GetRecoveryCount() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Recovery count", + p.GetRecoveryCount()) + } + if p.GetReplayRolloverCount() != 0 && + p.GetReplayRolloverCount() != UINT64_MAX_VAL { + 
printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Replay rollover count", + p.GetReplayRolloverCount()) + } + if p.GetNACKSentCount() != 0 && + p.GetNACKSentCount() != UINT64_MAX_VAL && + p.GetNACKSentCount() != UINT32_MAX_VAL_UINT64 { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "NACKs sent", + p.GetNACKSentCount()) + } + if p.GetNACKReceivedCount() != 0 && + p.GetNACKReceivedCount() != UINT64_MAX_VAL && + p.GetNACKReceivedCount() != UINT32_MAX_VAL_UINT64 { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "NACKs received", + p.GetNACKReceivedCount()) + } + if p.GetRxBytes() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total received bytes", + p.GetRxBytes()) + } + if p.GetTxBytes() != UINT64_MAX_VAL { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total transmitted bytes", + p.GetTxBytes()) + } + if p.GetBiDirBandwidth() != UINT16_MAX_VAL_UINT64 { + printPCIeHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", + "Bidirectional bandwidth (in GB/s)", + p.GetBiDirBandwidth()) + } + } + if stats.GetVRAMUsage() != nil { + printHdr = false + vram := stats.GetVRAMUsage() + if vram.GetTotalVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total VRAM (in MB)", + vram.GetTotalVRAM()) + } + if vram.GetUsedVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Used VRAM (in MB)", + vram.GetUsedVRAM()) + } + if vram.GetFreeVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Free VRAM (in MB)", + vram.GetFreeVRAM()) + } + if vram.GetTotalVisibleVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total visible VRAM (in MB)", + vram.GetTotalVisibleVRAM()) + } + if vram.GetUsedVisibleVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Used visible VRAM (in MB)", + vram.GetUsedVisibleVRAM()) + } + if vram.GetFreeVisibleVRAM() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Free visible VRAM (in MB)", + vram.GetFreeVisibleVRAM()) + } + if vram.GetTotalGTT() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Total GTT (in MB)", + vram.GetTotalGTT()) + } + if vram.GetUsedGTT() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Used GTT (in MB)", + vram.GetUsedGTT()) + } + if vram.GetFreeGTT() != 0 { + printVRAMUsageHdr(indent) + fmt.Printf(indent+" %-36s : %d\n", "Free GTT (in MB)", + vram.GetFreeGTT()) + } + } + if stats.GetEnergyConsumed() != 0 { + fmt.Printf(indent+"%-38s : %.2f\n", + "Accumulated energy consumed (in uJ)", + stats.GetEnergyConsumed()) + } + if stats.GetTotalCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Total correctable errors", + stats.GetTotalCorrectableErrors()) + } + if stats.GetTotalUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Total uncorrectable errors", + stats.GetTotalUncorrectableErrors()) + } + if stats.GetSDMACorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SDMA correctable errors", + stats.GetSDMACorrectableErrors()) + } + if stats.GetSDMAUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SDMA uncorrectable errors", + stats.GetSDMAUncorrectableErrors()) + } + if stats.GetGFXCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "GFX correctable errors", + stats.GetGFXCorrectableErrors()) + } + if stats.GetGFXUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "GFX uncorrectable errors", + 
stats.GetGFXUncorrectableErrors()) + } + if stats.GetMMHUBCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MMHUB correctable errors", + stats.GetMMHUBCorrectableErrors()) + } + if stats.GetMMHUBUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MMHUB uncorrectable errors", + stats.GetMMHUBUncorrectableErrors()) + } + if stats.GetATHUBCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "ATHUB correctable errors", + stats.GetATHUBCorrectableErrors()) + } + if stats.GetATHUBUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "ATHUB uncorrectable errors", + stats.GetATHUBUncorrectableErrors()) + } + if stats.GetBIFCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "BIF correctable errors", + stats.GetBIFCorrectableErrors()) + } + if stats.GetBIFUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "BIF uncorrectable errors", + stats.GetBIFUncorrectableErrors()) + } + if stats.GetHDPCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "HDP correctable errors", + stats.GetHDPCorrectableErrors()) + } + if stats.GetHDPUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "HDP uncorrectable errors", + stats.GetHDPUncorrectableErrors()) + } + if stats.GetXGMIWAFLCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "XGMI WAFL correctable errors", + stats.GetXGMIWAFLCorrectableErrors()) + } + if stats.GetXGMIWAFLUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "XGMI WAFL uncorrectable errors", + stats.GetXGMIWAFLUncorrectableErrors()) + } + if stats.GetDFCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "DF correctable errors", + stats.GetDFCorrectableErrors()) + } + if stats.GetDFUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "DF uncorrectable errors", + stats.GetDFUncorrectableErrors()) + } + if stats.GetSMNCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SMN correctable errors", + stats.GetSMNCorrectableErrors()) + } + if stats.GetSMNUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SMN uncorrectable errors", + stats.GetSMNUncorrectableErrors()) + } + if stats.GetSEMCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SEM correctable errors", + stats.GetSEMCorrectableErrors()) + } + if stats.GetSEMUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "SEM uncorrectable errors", + stats.GetSEMUncorrectableErrors()) + } + if stats.GetMP0CorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP0 correctable errors", + stats.GetMP0CorrectableErrors()) + } + if stats.GetMP0UncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP0 uncorrectable errors", + stats.GetMP0UncorrectableErrors()) + } + if stats.GetMP1CorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP1 correctable errors", + stats.GetMP1CorrectableErrors()) + } + if stats.GetMP1UncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MP1 uncorrectable errors", + stats.GetMP1UncorrectableErrors()) + } + if stats.GetFUSECorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "FUSE correctable errors", + stats.GetFUSECorrectableErrors()) + } + if stats.GetFUSEUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "FUSE uncorrectable errors", + stats.GetFUSEUncorrectableErrors()) + } + if stats.GetUMCCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "UMC correctable errors", + stats.GetUMCCorrectableErrors()) + } + if stats.GetUMCUncorrectableErrors() != 0 { + 
fmt.Printf(indent+"%-38s : %d\n", "UMC uncorrectable errors", + stats.GetUMCUncorrectableErrors()) + } + if stats.GetMCACorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MCA correctable errors", + stats.GetMCACorrectableErrors()) + } + if stats.GetMCAUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MCA uncorrectable errors", + stats.GetMCAUncorrectableErrors()) + } + if stats.GetVCNCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "VCN correctable errors", + stats.GetVCNCorrectableErrors()) + } + if stats.GetVCNUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "VCN uncorrectable errors", + stats.GetVCNUncorrectableErrors()) + } + if stats.GetJPEGCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "JPEG correctable errors", + stats.GetJPEGCorrectableErrors()) + } + if stats.GetJPEGUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "JPEG uncorrectable errors", + stats.GetJPEGUncorrectableErrors()) + } + if stats.GetIHCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "IH correctable errors", + stats.GetIHCorrectableErrors()) + } + if stats.GetIHUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "IH uncorrectable errors", + stats.GetIHUncorrectableErrors()) + } + if stats.GetMPIOCorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MPIO correctable errors", + stats.GetMPIOCorrectableErrors()) + } + if stats.GetMPIOUncorrectableErrors() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "MPIO uncorrectable errors", + stats.GetMPIOUncorrectableErrors()) + } + if stats.GetXGMINeighbor0TxNOPs() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Nops sent to XGMI neighbor0", + stats.GetXGMINeighbor0TxNOPs()) + } + if stats.GetXGMINeighbor0TxRequests() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Outgoing requests to XGMI neighbor0", + stats.GetXGMINeighbor0TxRequests()) + } + if stats.GetXGMINeighbor0TxResponses() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Outgoing responses to XGMI neighbor0", + stats.GetXGMINeighbor0TxRequests()) + } + if stats.GetXGMINeighbor0TXBeats() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Data beats sent to neighbor0", + stats.GetXGMINeighbor0TXBeats()) + } + if stats.GetXGMINeighbor1TxNOPs() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Nops sent to XGMI neighbor1", + stats.GetXGMINeighbor1TxNOPs()) + } + if stats.GetXGMINeighbor1TxRequests() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Outgoing requests to XGMI neighbor1", + stats.GetXGMINeighbor1TxRequests()) + } + if stats.GetXGMINeighbor1TxResponses() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Outgoing responses to XGMI neighbor1", + stats.GetXGMINeighbor1TxRequests()) + } + if stats.GetXGMINeighbor1TXBeats() != 0 { + fmt.Printf(indent+"%-38s : %d\n", "Data beats sent to neighbor1", + stats.GetXGMINeighbor1TXBeats()) + } + if stats.GetXGMINeighbor0TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor0 (in BPS)", + stats.GetXGMINeighbor0TxThroughput()) + } + if stats.GetXGMINeighbor1TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor1 (in BPS)", + stats.GetXGMINeighbor1TxThroughput()) + } + if stats.GetXGMINeighbor2TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor2 (in BPS)", + stats.GetXGMINeighbor2TxThroughput()) + } + if stats.GetXGMINeighbor3TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor3 (in BPS)", + stats.GetXGMINeighbor3TxThroughput()) + } + 
if stats.GetXGMINeighbor4TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor4 (in BPS)", + stats.GetXGMINeighbor4TxThroughput()) + } + if stats.GetXGMINeighbor5TxThroughput() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Tx throughput to XGMI neighbor5 (in BPS)", + stats.GetXGMINeighbor5TxThroughput()) + } + if stats.GetPowerUsage() != 0 { + fmt.Printf(indent+"%-38s : %d\n", + "Power usage (in Watts)", stats.GetPowerUsage()) + } + if (stats.GetFanSpeed() != 0) && + (stats.GetFanSpeed() != UINT16_MAX_VAL_UINT64) { + fmt.Printf(indent+"%-38s : %d\n", + "Fan speed (in RPMs)", stats.GetFanSpeed()) + } + if (stats.GetGFXActivityAccumulated() != 0) && + (stats.GetGFXActivityAccumulated() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", + "GFX activity accumulated", + stats.GetGFXActivityAccumulated()) + } + if (stats.GetMemoryActivityAccumulated() != 0) && + (stats.GetMemoryActivityAccumulated() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", + "Memory activity accumulated", + stats.GetMemoryActivityAccumulated()) + } + for i, linkStats := range stats.GetXGMILinkStats() { + link := "Link " + fmt.Sprintf("%v", i+1) + if (linkStats.GetDataRead() != 0) && + (linkStats.GetDataRead() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", link+" data read (in KB)", + linkStats.GetDataRead()) + } + if (linkStats.GetDataWrite() != 0) && + (linkStats.GetDataWrite() != UINT64_MAX_VAL) { + fmt.Printf(indent+"%-38s : %d\n", link+" data written (in KB)", + linkStats.GetDataWrite()) + } + } + if stats.GetViolationStats() != nil { + vStats := stats.GetViolationStats() + if vStats.GetCurrentAccumulatedCounter() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", "Current accumulated counter", + vStats.GetCurrentAccumulatedCounter()) + } + if vStats.GetProcessorHotResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "Processor hot residency accumulated", + vStats.GetProcessorHotResidencyAccumulated()) + } + if vStats.GetPPTResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", "PPT residency accumulated", + vStats.GetPPTResidencyAccumulated()) + } + if vStats.GetSocketThermalResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "Socket thermal residency accumulated", + vStats.GetSocketThermalResidencyAccumulated()) + } + if vStats.GetVRThermalResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "VR thermal residency accumulated", + vStats.GetVRThermalResidencyAccumulated()) + } + if vStats.GetHBMThermalResidencyAccumulated() != UINT64_MAX_VAL { + fmt.Printf(indent+"%-38s : %d\n", + "HBM thermal residency accumulated", + vStats.GetHBMThermalResidencyAccumulated()) + } + } + + fmt.Printf("\n%s\n", strings.Repeat("-", 80)) +} + +type ShadowGPU struct { + Id string + *aga.GPUSpec + *aga.GPUStatus + *aga.GPUStats +} + +func NewGPU(resp *aga.GPU) *ShadowGPU { + return &ShadowGPU{ + Id: utils.IdToStr(resp.GetSpec().GetId()), + GPUSpec: resp.GetSpec(), + GPUStatus: resp.GetStatus(), + GPUStats: resp.GetStats(), + } +} + +func printGPUJson(resp *aga.GPU) { + gpu := NewGPU(resp) + b, _ := json.MarshalIndent(gpu, " ", " ") + fmt.Printf(" %s", string(b)) +} + +func gpuUpdateCmdPreRunE(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if cmd.Flags().NFlag() == 1 { + return fmt.Errorf("Nothing to update") + } + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + if 
cmd.Flags().Changed("admin-state") { + switch strings.ToLower(gpuAdminState) { + case "up": + gpuAdminStateVal = aga.GPUAdminState_GPU_ADMIN_STATE_UP + case "down": + gpuAdminStateVal = aga.GPUAdminState_GPU_ADMIN_STATE_DOWN + default: + return fmt.Errorf("Invalid argument for \"admin-state\", please " + + "refer help") + } + } + if cmd.Flags().Changed("compute-partition") { + switch strings.ToLower(computePartition) { + case "spx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_SPX + case "dpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_DPX + case "tpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_TPX + case "qpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_QPX + case "cpx": + computePartitionVal = + aga.GPUComputePartitionType_GPU_COMPUTE_PARTITION_TYPE_CPX + default: + return fmt.Errorf("Invalid argument for \"compute-partition\", " + + "please refer help") + } + } + if cmd.Flags().Changed("memory-partition") { + switch strings.ToLower(memPartition) { + case "nps1": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS1 + case "nps2": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS2 + case "nps4": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS4 + case "nps8": + memPartitionVal = + aga.GPUMemoryPartitionType_GPU_MEMORY_PARTITION_TYPE_NPS8 + default: + return fmt.Errorf("Invalid argument for \"memory-partition\", " + + "please refer help") + } + } + if cmd.Flags().Changed("perf-level") { + switch strings.ToLower(perfLevel) { + case "none": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_NONE + case "auto": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_AUTO + case "low": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_LOW + case "high": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_HIGH + case "deterministic": + PerformanceLevelVal = + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_DETERMINISTIC + case "memclock": + PerformanceLevelVal = + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_STABLE_MIN_MCLK + case "sysclock": + PerformanceLevelVal = + aga.GPUPerformanceLevel_GPU_PERF_LEVEL_STABLE_MIN_SCLK + case "manual": + PerformanceLevelVal = aga.GPUPerformanceLevel_GPU_PERF_LEVEL_MANUAL + default: + return fmt.Errorf("Invalid argument for \"perf-level\", please " + + "refer help") + } + } + if cmd.Flags().Changed("clock-frequency") != + cmd.Flags().Changed("clock-type") { + return fmt.Errorf("Both \"clock-type\" and \"clock-frequency\" need " + + "to be specified") + } + if cmd.Flags().Changed("clock-type") { + switch strings.ToLower(gpuClkType) { + case "memory": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_MEMORY + case "system": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_SYSTEM + case "video": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_VIDEO + case "data": + clockType = aga.GPUClockType_GPU_CLOCK_TYPE_DATA + default: + return fmt.Errorf("Invalid \"clock-type\" specified, please " + + "refer help") + } + } + if cmd.Flags().Changed("clock-frequency") { + _, err := fmt.Sscanf(gpuClkFreq, "%d-%d", &gpuClkFreqLo, &gpuClkFreqHi) + if err != nil { + return fmt.Errorf("Invalid range for \"clock-frequency\", please " + + "refer help") + } + } + return nil +} + +func gpuUpdateCmdHandler(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if len(args) > 0 { + 
return fmt.Errorf("Invalid argument") + } + cmd.SilenceUsage = true + + // get GPU spec + respMsg := &aga.GPUGetResponse{} + var req *aga.GPUGetRequest + if cmd.Flags().Changed("id") { + // get specific GPU + req = &aga.GPUGetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + } + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + client := aga.NewGPUSvcClient(c) + respMsg, err = client.GPUGet(ctxt, req) + if err != nil { + return fmt.Errorf("Getting GPU failed, err %v", err) + } + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Get GPU failed with %v error", respMsg.ApiStatus) + } + resp := respMsg.GetResponse()[0] + gpuSpec := resp.GetSpec() + if gpuSpec == nil { + return fmt.Errorf("GPU object not found") + } + // update the configured fields + updateSpec := *gpuSpec + if cmd.Flags().Changed("admin-state") { + updateSpec.AdminState = gpuAdminStateVal + } + if cmd.Flags().Changed("overdrive-level") { + updateSpec.OverDriveLevel = overDriveLevel + } + if cmd.Flags().Changed("power-cap") { + updateSpec.GPUPowerCap = powerCap + } + if cmd.Flags().Changed("perf-level") { + updateSpec.PerformanceLevel = PerformanceLevelVal + } + if cmd.Flags().Changed("clock-frequency") { + for i, freq := range updateSpec.GetClockFrequency() { + if freq.GetClockType() == clockType { + updateSpec.ClockFrequency[i] = &aga.GPUClockFrequencyRange{ + ClockType: clockType, + LowFrequency: gpuClkFreqLo, + HighFrequency: gpuClkFreqHi, + } + } + } + } + if cmd.Flags().Changed("fan-speed") { + updateSpec.FanSpeed = fanSpeed + } + if cmd.Flags().Changed("compute-partition") { + updateSpec.ComputePartitionType = computePartitionVal + } + if cmd.Flags().Changed("memory-partition") { + updateSpec.MemoryPartitionType = memPartitionVal + } + reqMsg := &aga.GPUUpdateRequest{ + Spec: []*aga.GPUSpec{ + &updateSpec, + }, + } + // GPU agent call + updateRespMsg, err := client.GPUUpdate(ctxt, reqMsg) + if err != nil { + return fmt.Errorf("Updating GPU failed, err %v", err) + } + if updateRespMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with error %v, error code %v", + updateRespMsg.ApiStatus, updateRespMsg.ErrorCode) + } + fmt.Printf("Updating GPU succeeded\n") + return nil +} + +func gpuResetCmdPreRunE(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if err := utils.IsUUIDValid(gpuID); err != nil { + return err + } + numFlags := 0 + if cmd.Flags().Changed("clocks") { + numFlags += 1 + } + if cmd.Flags().Changed("fans") { + numFlags += 1 + } + if cmd.Flags().Changed("power-profile") { + numFlags += 1 + } + if cmd.Flags().Changed("power-overdrive") { + numFlags += 1 + } + if cmd.Flags().Changed("xgmi-error") { + numFlags += 1 + } + if cmd.Flags().Changed("perf-determinism") { + numFlags += 1 + } + if cmd.Flags().Changed("compute-partition") { + numFlags += 1 + } + if cmd.Flags().Changed("nps-mode") { + numFlags += 1 + } + if numFlags == 0 { + // more than 1 reset option is specified, reject + return fmt.Errorf("Invalid arguments, one of \"clocks\", \"fans\", " + + "\"power-profile\", \"power-overdrive\", \"xgmi-error\", " + + "\"perf-determinism\", \"compute-partition\", \"nps-mode\" must " + + "be specified") + } + // all above options are mutually exclusive + if numFlags > 1 { + // more than 1 reset option is specified, 
reject + return fmt.Errorf("Invalid arguments, \"clocks\", \"fans\", " + + "\"power-profile\", \"power-overdrive\", \"xgmi-error\", " + + "\"perf-determinism\", \"compute-partition\", \"nps-mode\" are " + + "mutually exlcusive, specify only one") + } + return nil +} + +func gpuResetCmdHandler(cmd *cobra.Command, args []string) error { + if cmd == nil { + return fmt.Errorf("Invalid argument") + } + if len(args) > 0 { + return fmt.Errorf("Invalid argument") + } + cmd.SilenceUsage = true + + // connect to GPU agent + c, ctxt, cancel, err := utils.CreateNewAGAGRPClient() + if err != nil { + return fmt.Errorf("Could not connect to the GPU agent, is agent " + + "running?") + } + defer c.Close() + defer cancel() + + req := &aga.GPUResetRequest{ + Id: [][]byte{uuid.FromStringOrNil(gpuID).Bytes()}, + } + if cmd.Flags().Changed("clocks") { + req.Reset_ = &aga.GPUResetRequest_ResetClocks{ + ResetClocks: true, + } + } else if cmd.Flags().Changed("fans") { + req.Reset_ = &aga.GPUResetRequest_ResetFans{ + ResetFans: true, + } + } else if cmd.Flags().Changed("power-profile") { + req.Reset_ = &aga.GPUResetRequest_ResetPowerProfile{ + ResetPowerProfile: true, + } + } else if cmd.Flags().Changed("power-overdrive") { + req.Reset_ = &aga.GPUResetRequest_ResetPowerOverDrive{ + ResetPowerOverDrive: true, + } + } else if cmd.Flags().Changed("xgmi-error") { + req.Reset_ = &aga.GPUResetRequest_ResetXGMIError{ + ResetXGMIError: true, + } + } else if cmd.Flags().Changed("perf-determinism") { + req.Reset_ = &aga.GPUResetRequest_ResetPerfDeterminism{ + ResetPerfDeterminism: true, + } + } else if cmd.Flags().Changed("compute-partition") { + req.Reset_ = &aga.GPUResetRequest_ResetComputePartition{ + ResetComputePartition: true, + } + } else if cmd.Flags().Changed("nps-mode") { + req.Reset_ = &aga.GPUResetRequest_ResetNPSMode{ + ResetNPSMode: true, + } + } + client := aga.NewGPUSvcClient(c) + respMsg, err := client.GPUReset(ctxt, req) + if err != nil { + return fmt.Errorf("Resetting GPU failed, err %v", err) + } + if respMsg.ApiStatus != aga.ApiStatus_API_STATUS_OK { + return fmt.Errorf("Operation failed with error %v, error code %v", + respMsg.ApiStatus, respMsg.ErrorCode) + } + fmt.Printf("Resetting GPU succeeded\n") + return nil +} diff --git a/sw/nic/gpuagent/protos/gpu.proto b/sw/nic/gpuagent/protos/gpu.proto index dd8a2e2..a5d9e26 100644 --- a/sw/nic/gpuagent/protos/gpu.proto +++ b/sw/nic/gpuagent/protos/gpu.proto @@ -542,19 +542,46 @@ message GPUXGMILinkStats { // GPU violation stats message GPUViolationStats { // current acummulated counter - uint64 CurrentAccumulatedCounter = 1; + uint64 CurrentAccumulatedCounter = 1; // processor hot residency accumulated - uint64 ProcessorHotResidencyAccumulated = 2; + uint64 ProcessorHotResidencyAccumulated = 2; // Package Power Tracking (PPT) residency accumulated - uint64 PPTResidencyAccumulated = 3; + uint64 PPTResidencyAccumulated = 3; // socket thermal residency accumulated - uint64 SocketThermalResidencyAccumulated = 4; + uint64 SocketThermalResidencyAccumulated = 4; // Voltage Rail (VR) thermal residency accumulated - uint64 VRThermalResidencyAccumulated = 5; + uint64 VRThermalResidencyAccumulated = 5; // High Bandwidth Memory (HBM) thermal residency accumulated - uint64 HBMThermalResidencyAccumulated = 6; + uint64 HBMThermalResidencyAccumulated = 6; + // processor hot residency percentage + uint64 ProcessorHotResidencyPercentage = 7; + // Package Power Tracking (PPT) residency percentage + uint64 PPTResidencyPercentage = 8; + // socket thermal residency percentage + 
uint64 SocketThermalResidencyPercentage = 9; + // Voltage Rail (VR) thermal residency percentage + uint64 VRThermalResidencyPercentage = 10; + // High Bandwidth Memory (HBM) thermal residency percentage + uint64 HBMThermalResidencyPercentage = 11; + // gfx clock below host limit power accumulated + repeated uint64 GFXBelowHostLimitPowerAccumulated = 12; + // gfx clock below host limit thermal accumulated + repeated uint64 GFXBelowHostLimitTHMAccumulated = 13; + // gfx low utilization accumulated + repeated uint64 GFXLowUtilizationAccumulated = 14; + // gfx clock below host limit total accumulated + repeated uint64 GFXBelowHostLimitTotalAccumulated = 15; + // gfx clock below host limit power percentage + repeated uint64 GFXBelowHostLimitPowerPercentage = 16; + // gfx clock below host limit thermal percentage + repeated uint64 GFXBelowHostLimitTHMPercentage = 17; + // gfx low utilization percentage + repeated uint64 GFXLowUtilizationPercentage = 18; + // gfx below host limit total percentage + repeated uint64 GFXBelowHostLimitTotalPercentage = 19; } + // GPU statistics message GPUStats { // current graphics package power (in Watts) diff --git a/sw/nic/gpuagent/svc/gpu_to_proto.hpp b/sw/nic/gpuagent/svc/gpu_to_proto.hpp index 479887d..7cad3f1 100644 --- a/sw/nic/gpuagent/svc/gpu_to_proto.hpp +++ b/sw/nic/gpuagent/svc/gpu_to_proto.hpp @@ -610,6 +610,34 @@ aga_gpu_violation_stats_to_proto (amdgpu::GPUViolationStats *proto_stats, stats->vr_thermal_residency_accumulated); proto_stats->set_hbmthermalresidencyaccumulated( stats->hbm_thermal_residency_accumulated); + proto_stats->set_processorhotresidencypercentage( + stats->processor_hot_residency_percentage); + proto_stats->set_pptresidencypercentage( + stats->ppt_residency_percentage); + proto_stats->set_socketthermalresidencypercentage( + stats->socket_thermal_residency_percentage); + proto_stats->set_vrthermalresidencypercentage( + stats->vr_thermal_residency_percentage); + proto_stats->set_hbmthermalresidencypercentage( + stats->hbm_thermal_residency_percentage); + for (uint16_t i = 0; i < AGA_GPU_MAX_XCC; i++) { + proto_stats->add_gfxbelowhostlimitpoweraccumulated( + stats->gfx_clk_below_host_limit_power_accumulated[i]); + proto_stats->add_gfxbelowhostlimitthmaccumulated( + stats->gfx_clk_below_host_limit_thermal_accumulated[i]); + proto_stats->add_gfxlowutilizationaccumulated( + stats->gfx_low_utilization_accumulated[i]); + proto_stats->add_gfxbelowhostlimittotalaccumulated( + stats->gfx_clk_below_host_limit_total_accumulated[i]); + proto_stats->add_gfxbelowhostlimitpowerpercentage( + stats->gfx_clk_below_host_limit_power_percentage[i]); + proto_stats->add_gfxbelowhostlimitthmpercentage( + stats->gfx_clk_below_host_limit_thermal_percentage[i]); + proto_stats->add_gfxlowutilizationpercentage( + stats->gfx_low_utilization_percentage[i]); + proto_stats->add_gfxbelowhostlimittotalpercentage( + stats->gfx_clk_below_host_limit_total_percentage[i]); + } } // populate proto buf stats from gpu stats diff --git a/sw/nic/gpuagent/svc/gpu_to_proto.hpp.orig b/sw/nic/gpuagent/svc/gpu_to_proto.hpp.orig new file mode 100644 index 0000000..479887d --- /dev/null +++ b/sw/nic/gpuagent/svc/gpu_to_proto.hpp.orig @@ -0,0 +1,824 @@ + +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + + +//---------------------------------------------------------------------------- +/// +/// \file +/// This module defines protobuf conversion APIs for gpu object +/// +//---------------------------------------------------------------------------- + +#ifndef __AGA_SVC_GPU_TO_PROTO_HPP__ +#define __AGA_SVC_GPU_TO_PROTO_HPP__ + +#include +#include "nic/gpuagent/svc/gpu.hpp" +#include "nic/gpuagent/api/include/aga_gpu.hpp" + +static inline amdgpu::GPUAdminState +aga_gpu_admin_state_to_proto (aga_gpu_admin_state_t admin_state) +{ + switch(admin_state) { + case AGA_GPU_ADMIN_STATE_UP: + return amdgpu::GPU_ADMIN_STATE_UP; + case AGA_GPU_ADMIN_STATE_DOWN: + return amdgpu::GPU_ADMIN_STATE_DOWN; + case AGA_GPU_ADMIN_STATE_NONE: + default: + break; + } + return amdgpu::GPU_ADMIN_STATE_NONE; +} + +static inline amdgpu::GPUPerformanceLevel +aga_gpu_perf_level_to_proto (aga_gpu_perf_level_t perf_level) +{ + switch (perf_level) { + case AGA_GPU_PERF_LEVEL_AUTO: + return amdgpu::GPU_PERF_LEVEL_AUTO; + case AGA_GPU_PERF_LEVEL_LOW: + return amdgpu::GPU_PERF_LEVEL_LOW; + case AGA_GPU_PERF_LEVEL_HIGH: + return amdgpu::GPU_PERF_LEVEL_HIGH; + case AGA_GPU_PERF_LEVEL_DETERMINISTIC: + return amdgpu::GPU_PERF_LEVEL_DETERMINISTIC; + case AGA_GPU_PERF_LEVEL_STABLE_WITH_MCLK: + return amdgpu::GPU_PERF_LEVEL_STABLE_MIN_MCLK; + case AGA_GPU_PERF_LEVEL_STABLE_WITH_SCLK: + return amdgpu::GPU_PERF_LEVEL_STABLE_MIN_SCLK; + case AGA_GPU_PERF_LEVEL_MANUAL: + return amdgpu::GPU_PERF_LEVEL_MANUAL; + case AGA_GPU_PERF_LEVEL_NONE: + default: + break; + } + return amdgpu::GPU_PERF_LEVEL_NONE; +} + +static inline amdgpu::GPUClockType +aga_gpu_clock_type_to_proto (aga_gpu_clock_type_t type) +{ + switch (type) { + case AGA_GPU_CLOCK_TYPE_FABRIC: + return amdgpu::GPU_CLOCK_TYPE_FABRIC; + case AGA_GPU_CLOCK_TYPE_MEMORY: + return amdgpu::GPU_CLOCK_TYPE_MEMORY; + case AGA_GPU_CLOCK_TYPE_SYSTEM: + return amdgpu::GPU_CLOCK_TYPE_SYSTEM; + case AGA_GPU_CLOCK_TYPE_SOC: + return amdgpu::GPU_CLOCK_TYPE_SOC; + case AGA_GPU_CLOCK_TYPE_DCE: + return amdgpu::GPU_CLOCK_TYPE_DCE; + case AGA_GPU_CLOCK_TYPE_PCIE: + return amdgpu::GPU_CLOCK_TYPE_PCIE; + case AGA_GPU_CLOCK_TYPE_VIDEO: + return amdgpu::GPU_CLOCK_TYPE_VIDEO; + case AGA_GPU_CLOCK_TYPE_DATA: + return amdgpu::GPU_CLOCK_TYPE_DATA; + case AGA_GPU_CLOCK_TYPE_NONE: + default: + break; + } + return amdgpu::GPU_CLOCK_TYPE_NONE; +} + +static inline amdgpu::GPUThrottlingStatus +aga_gpu_throttling_status_to_proto (aga_gpu_throttling_status_t status) +{ + switch (status) { + case AGA_GPU_THROTTLING_STATUS_OFF: + return amdgpu::GPU_THROTTLING_STATUS_OFF; + case AGA_GPU_THROTTLING_STATUS_ON: + return amdgpu::GPU_THROTTLING_STATUS_ON; + case AGA_GPU_THROTTLING_STATUS_NONE: + default: + break; + } + return amdgpu::GPU_THROTTLING_STATUS_NONE; +} + +static inline amdgpu::GPUVirtualizationMode +aga_gpu_virtualization_mode_to_proto (aga_gpu_virtualization_mode_t mode) +{ + switch (mode) { + case AGA_VIRTUALIZATION_MODE_BAREMETAL: + return amdgpu::GPU_VIRTUALIZATION_MODE_BAREMETAL; + case AGA_VIRTUALIZATION_MODE_HOST: + return amdgpu::GPU_VIRTUALIZATION_MODE_HOST; + case AGA_VIRTUALIZATION_MODE_GUEST: + return 
amdgpu::GPU_VIRTUALIZATION_MODE_GUEST; + case AGA_VIRTUALIZATION_MODE_PASSTHROUGH: + return amdgpu::GPU_VIRTUALIZATION_MODE_PASSTHROUGH; + default: + break; + } + return amdgpu::GPU_VIRTUALIZATION_MODE_NONE; +} + +static inline void +aga_gpu_clock_spec_to_proto (GPUClockFrequencyRange *proto_spec, + const aga_gpu_clock_freq_range_t *spec) +{ + proto_spec->set_clocktype(aga_gpu_clock_type_to_proto(spec->clock_type)); + proto_spec->set_lowfrequency(spec->lo); + proto_spec->set_highfrequency(spec->hi); +} + +static inline amdgpu::GPUComputePartitionType +aga_gpu_compute_partition_type_to_proto (aga_gpu_compute_partition_type_t type) +{ + switch (type) { + case AGA_GPU_COMPUTE_PARTITION_TYPE_SPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_SPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_DPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_DPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_TPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_TPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_QPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_QPX; + case AGA_GPU_COMPUTE_PARTITION_TYPE_CPX: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_CPX; + default: + return amdgpu::GPU_COMPUTE_PARTITION_TYPE_NONE; + } +} + +static inline amdgpu::GPUMemoryPartitionType +aga_gpu_memory_partition_type_to_proto (aga_gpu_memory_partition_type_t type) +{ + switch (type) { + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS1: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS1; + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS2: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS2; + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS4: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS4; + case AGA_GPU_MEMORY_PARTITION_TYPE_NPS8: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NPS8; + default: + return amdgpu::GPU_MEMORY_PARTITION_TYPE_NONE; + } +} + +// populate proto buf spec from gpu API spec +static inline void +aga_gpu_api_spec_to_proto (GPUSpec *proto_spec, + const aga_gpu_spec_t *spec) +{ + proto_spec->set_id(spec->key.id, OBJ_MAX_KEY_LEN); + proto_spec->set_adminstate(aga_gpu_admin_state_to_proto(spec->admin_state)); + proto_spec->set_overdrivelevel(spec->overdrive_level); + proto_spec->set_gpupowercap(spec->gpu_power_cap); + proto_spec->set_performancelevel(aga_gpu_perf_level_to_proto( + spec->perf_level)); + for (uint32_t i = 0; i < spec->num_clock_freqs; i++) { + aga_gpu_clock_spec_to_proto(proto_spec->add_clockfrequency(), + &spec->clock_freq[i]); + } + proto_spec->set_fanspeed(spec->fan_speed); + proto_spec->set_computepartitiontype( + aga_gpu_compute_partition_type_to_proto( + spec->compute_partition_type)); + proto_spec->set_memorypartitiontype( + aga_gpu_memory_partition_type_to_proto( + spec->memory_partition_type)); + // TODO: fill gpu RAS spec +} + +static inline void +aga_gpu_fw_version_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + for (uint32_t i = 0; i < status->num_fw_versions; i++) { + auto fw_ver = proto_status->add_firmwareversion(); + fw_ver->set_firmware(status->fw_version[i].firmware); + fw_ver->set_version(status->fw_version[i].version); + } +} + +static inline void +aga_gpu_clock_status_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + for (uint32_t i = 0; i < status->num_clock_status; i++) { + auto clk_status = proto_status->add_clockstatus(); + clk_status->set_type(aga_gpu_clock_type_to_proto( + status->clock_status[i].clock_type)); + clk_status->set_frequency(status->clock_status[i].frequency); + clk_status->set_lowfrequency(status->clock_status[i].low_frequency); + 
clk_status->set_highfrequency(status->clock_status[i].high_frequency); + clk_status->set_locked(status->clock_status[i].locked); + clk_status->set_deepsleep(status->clock_status[i].deep_sleep); + } +} +static inline void +aga_gpu_voltage_curve_point_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + // voltage-curve-point proto message is currently not defined in status +} + +static inline amdgpu::GPUOperStatus +aga_gpu_oper_status_to_proto (aga_gpu_oper_state_t oper_status) +{ + switch (oper_status) { + case AGA_GPU_OPER_STATE_UP: + return amdgpu::GPU_OPER_STATUS_UP; + case AGA_GPU_OPER_STATE_DOWN: + return amdgpu::GPU_OPER_STATUS_DOWN; + case AGA_GPU_OPER_STATE_NONE: + default: + break; + } + return amdgpu::GPU_OPER_STATUS_NONE; +} + +static inline amdgpu::GPUXGMIErrorStatus +aga_gpu_xgmi_error_status_to_proto (aga_gpu_xgmi_error_status_t xgmi_status) +{ + switch (xgmi_status) { + case AGA_GPU_XGMI_STATUS_NO_ERROR: + return amdgpu::GPU_XGMI_STATUS_NO_ERROR; + case AGA_GPU_XGMI_STATUS_ONE_ERROR: + return amdgpu::GPU_XGMI_STATUS_ONE_ERROR; + case AGA_GPU_XGMI_STATUS_MULTIPLE_ERROR: + return amdgpu::GPU_XGMI_STATUS_MULTIPLE_ERROR; + default: + break; + } + return amdgpu::GPU_XGMI_STATUS_NONE; +} + +static inline void +aga_gpu_xgmi_status_to_proto (amdgpu::GPUXGMIStatus *proto_status, + const aga_gpu_xgmi_status_t *status) +{ + proto_status->set_errorstatus( + aga_gpu_xgmi_error_status_to_proto(status->error_status)); + proto_status->set_width(status->width); + proto_status->set_speed(status->speed); +} + +static inline amdgpu::PCIeSlotType +aga_gpu_pcie_slot_type_to_proto (aga_pcie_slot_type_t slot_type) +{ + switch (slot_type) { + case AGA_PCIE_SLOT_TYPE_PCIE: + return amdgpu::PCIE_SLOT_TYPE_PCIE; + case AGA_PCIE_SLOT_TYPE_OAM: + return amdgpu::PCIE_SLOT_TYPE_OAM; + case AGA_PCIE_SLOT_TYPE_CEM: + return amdgpu::PCIE_SLOT_TYPE_CEM; + case AGA_PCIE_SLOT_TYPE_UNKNOWN: + return amdgpu::PCIE_SLOT_TYPE_UNKNOWN; + default: + return amdgpu::PCIE_SLOT_TYPE_NONE; + } +} + +// populte PCIe status proto +static inline void +aga_gpu_pcie_status_to_proto (GPUPCIeStatus *proto_status, + const aga_gpu_pcie_status_t *status) +{ + proto_status->set_slottype( + aga_gpu_pcie_slot_type_to_proto(status->slot_type)); + proto_status->set_pciebusid(status->pcie_bus_id); + proto_status->set_maxwidth(status->max_width); + proto_status->set_maxspeed(status->max_speed); + proto_status->set_version(status->version); + proto_status->set_width(status->width); + proto_status->set_speed(status->speed); + proto_status->set_bandwidth(status->bandwidth); +} + +static inline amdgpu::VRAMType +aga_gpu_vram_type_to_proto (aga_vram_type_t type) +{ + switch (type) { + case AGA_VRAM_TYPE_HBM: + return amdgpu::VRAM_TYPE_HBM; + case AGA_VRAM_TYPE_HBM2: + return amdgpu::VRAM_TYPE_HBM2; + case AGA_VRAM_TYPE_HBM2E: + return amdgpu::VRAM_TYPE_HBM2E; + case AGA_VRAM_TYPE_HBM3: + return amdgpu::VRAM_TYPE_HBM3; + case AGA_VRAM_TYPE_DDR2: + return amdgpu::VRAM_TYPE_DDR2; + case AGA_VRAM_TYPE_DDR3: + return amdgpu::VRAM_TYPE_DDR3; + case AGA_VRAM_TYPE_DDR4: + return amdgpu::VRAM_TYPE_DDR4; + case AGA_VRAM_TYPE_GDDR1: + return amdgpu::VRAM_TYPE_GDDR1; + case AGA_VRAM_TYPE_GDDR2: + return amdgpu::VRAM_TYPE_GDDR2; + case AGA_VRAM_TYPE_GDDR3: + return amdgpu::VRAM_TYPE_GDDR3; + case AGA_VRAM_TYPE_GDDR4: + return amdgpu::VRAM_TYPE_GDDR4; + case AGA_VRAM_TYPE_GDDR5: + return amdgpu::VRAM_TYPE_GDDR5; + case AGA_VRAM_TYPE_GDDR6: + return amdgpu::VRAM_TYPE_GDDR6; + case AGA_VRAM_TYPE_GDDR7: + return amdgpu::VRAM_TYPE_GDDR7; + 
case AGA_VRAM_TYPE_UNKNOWN: + return amdgpu::VRAM_TYPE_UNKNOWN; + default: + return amdgpu::VRAM_TYPE_NONE; + } +} + + +// populate VRAM status proto +static inline void +aga_gpu_vram_status_to_proto (GPUVRAMStatus *proto_status, + const aga_gpu_vram_status_t *status) +{ + proto_status->set_type(aga_gpu_vram_type_to_proto(status->type)); + proto_status->set_vendor(status->vendor); + proto_status->set_size(status->size); +} + +static inline amdgpu::GPUPageStatus +aga_gpu_page_status_to_proto (aga_gpu_page_status_t page_status) +{ + switch (page_status) { + case AGA_GPU_PAGE_STATUS_RESERVED: + return amdgpu::GPU_PAGE_STATUS_RESERVED; + case AGA_GPU_PAGE_STATUS_PENDING: + return amdgpu::GPU_PAGE_STATUS_PENDING; + case AGA_GPU_PAGE_STATUS_UNRESERVABLE: + return amdgpu::GPU_PAGE_STATUS_UNRESERVABLE; + case AGA_GPU_PAGE_STATUS_NONE: + default: + break; + } + return amdgpu::GPU_PAGE_STATUS_NONE; +} + +// populate proto buf status from gpu status +static inline void +aga_gpu_api_status_to_proto (GPUStatus *proto_status, + const aga_gpu_status_t *status) +{ + proto_status->set_index(status->index); + proto_status->set_gpuhandle((uint64_t)status->handle); + proto_status->set_serialnum(status->serial_num); + proto_status->set_cardseries(status->card_series); + proto_status->set_cardmodel(status->card_model); + proto_status->set_cardvendor(status->card_vendor); + proto_status->set_cardsku(status->card_sku); + proto_status->set_driverversion(status->driver_version); + proto_status->set_vbiosversion(status->vbios_version); + proto_status->set_vbiospartnumber(status->vbios_part_number); + aga_gpu_fw_version_to_proto(proto_status, status); + proto_status->set_memoryvendor(status->memory_vendor); + proto_status->set_operstatus( + aga_gpu_oper_status_to_proto(status->oper_status)); + aga_gpu_clock_status_to_proto(proto_status, status); + for (uint32_t i = 0; i < status->num_kfd_process_id; i++) { + if (status->kfd_process_id[i]) { + // copy only non-zero process ids only + proto_status->add_kfdprocessid(status->kfd_process_id[i]); + } + } + // TODO: fill RAS status + aga_gpu_xgmi_status_to_proto(proto_status->mutable_xgmistatus(), + &status->xgmi_status); + aga_gpu_voltage_curve_point_to_proto(proto_status, status); + aga_gpu_vram_status_to_proto(proto_status->mutable_vramstatus(), + &status->vram_status); + aga_gpu_pcie_status_to_proto(proto_status->mutable_pciestatus(), + &status->pcie_status); + proto_status->set_throttlingstatus(aga_gpu_throttling_status_to_proto( + status->throttling_status)); + proto_status->set_fwtimestamp(status->fw_timestamp); + proto_status->set_partitionid(status->partition_id); + proto_status->set_virtualizationmode(aga_gpu_virtualization_mode_to_proto( + status->virtualization_mode)); + for (uint32_t i = 0; i < status->num_gpu_partition; i++) { + if (status->gpu_partition[i].valid()) { + proto_status->add_gpupartition(status->gpu_partition[i].id, + OBJ_MAX_KEY_LEN); + } + } + if (status->physical_gpu.valid()) { + proto_status->set_physicalgpu(status->physical_gpu.id, OBJ_MAX_KEY_LEN); + } + proto_status->set_kfdid(status->kfd_id); + proto_status->set_nodeid(status->node_id); + proto_status->set_drmrenderid(status->drm_render_id); + proto_status->set_drmcardid(status->drm_card_id); +} + +// populate gpu bad page records proto buf +static inline void +aga_gpu_bad_page_api_info_to_proto (uint32_t num_bad_pages, + aga_gpu_bad_page_record_t *records, + void *ctxt) +{ + streaming_get_ctxt_t *get_ctxt; + GPUBadPageGetResponse *proto_rsp; + grpc::ServerWriter *writer; + + get_ctxt = 
(streaming_get_ctxt_t *)ctxt; + proto_rsp = (GPUBadPageGetResponse *)get_ctxt->msg_ctxt; + writer = (grpc::ServerWriter *)get_ctxt->writer_ctxt; + + for (uint32_t i = 0; i < num_bad_pages; i++) { + get_ctxt->count++; + auto proto_record = proto_rsp->add_record(); + proto_record->set_gpu(records[i].key.id, OBJ_MAX_KEY_LEN); + proto_record->set_pageaddress(records[i].page_address); + proto_record->set_pagesize(records[i].page_size); + proto_record->set_pagestatus( + aga_gpu_page_status_to_proto(records[i].page_status)); + if (proto_rsp->record_size() == AGA_MAX_STREAMING_RSP_SIZE) { + proto_rsp->set_apistatus(sdk_ret_to_api_status(SDK_RET_OK)); + proto_rsp->set_errorcode(sdk_ret_to_error_code(SDK_RET_OK)); + if (!writer->Write(*proto_rsp)) { + AGA_TRACE_ERR("Failed to write gpu bad page info to gRPC " + "stream"); + } + proto_rsp->Clear(); + } + } +} + +// populate gpu compute partition get response proto buf +static inline void +aga_gpu_compute_partition_info_to_proto ( + aga_gpu_compute_partition_info_t *info, void *ctxt) +{ + GPUComputePartitionGetResponse *proto_rsp = + (GPUComputePartitionGetResponse *)ctxt; + + auto resp = proto_rsp->add_response(); + resp->set_id(info->physical_gpu.id, OBJ_MAX_KEY_LEN); + resp->set_partitiontype(aga_gpu_compute_partition_type_to_proto( + info->partition_type)); + for (uint32_t i = 0; i < info->num_gpu_partition; i++) { + if (info->gpu_partition[i].valid()) { + resp->add_gpupartition(info->gpu_partition[i].id, + OBJ_MAX_KEY_LEN); + } + } +} + +// populate gpu memory partition get response proto buf +static inline void +aga_gpu_memory_partition_info_to_proto ( + aga_gpu_memory_partition_info_t *info, void *ctxt) +{ + GPUMemoryPartitionGetResponse *proto_rsp = + (GPUMemoryPartitionGetResponse *)ctxt; + + auto resp = proto_rsp->add_response(); + resp->set_id(info->physical_gpu.id, OBJ_MAX_KEY_LEN); + resp->set_partitiontype(aga_gpu_memory_partition_type_to_proto( + info->partition_type)); +} + +// populate temperature proto buf stats from gpu stats +static inline void +aga_gpu_temp_stats_to_proto (amdgpu::GPUTemperatureStats *proto_stats, + const aga_gpu_temperature_stats_t *stats) +{ + proto_stats->add_hbmtemperature(stats->hbm_temperature[0]); + proto_stats->add_hbmtemperature(stats->hbm_temperature[1]); + proto_stats->add_hbmtemperature(stats->hbm_temperature[2]); + proto_stats->add_hbmtemperature(stats->hbm_temperature[3]); + proto_stats->set_edgetemperature(stats->edge_temperature); + proto_stats->set_junctiontemperature(stats->junction_temperature); + proto_stats->set_memorytemperature(stats->memory_temperature); +} + +// populate proto gpu usage stats from gpu stats +static inline void +aga_gpu_usage_stats_to_proto (GPUUsage *proto_stats, + const aga_gpu_usage_t *stats) +{ + proto_stats->set_gfxactivity(stats->gfx_activity); + proto_stats->set_umcactivity(stats->umc_activity); + proto_stats->set_mmactivity(stats->mm_activity); + for (uint16_t i = 0; i < AGA_GPU_MAX_VCN; i++) { + proto_stats->add_vcnactivity(stats->vcn_activity[i]); + proto_stats->add_vcnbusyinst(stats->vcn_busy[i]); + } + for (uint16_t i = 0; i < AGA_GPU_MAX_JPEG; i++) { + proto_stats->add_jpegactivity(stats->jpeg_activity[i]); + } + for (uint16_t i = 0; i < AGA_GPU_MAX_JPEG_ENG; i++) { + proto_stats->add_jpegbusyinst(stats->jpeg_busy[i]); + } + for (uint16_t i = 0; i < AGA_GPU_MAX_XCC; i++) { + proto_stats->add_gfxbusyinst(stats->gfx_busy_inst[i]); + } +} + +// populate proto memory usage stats from gpu stats +static inline void +aga_gpu_memory_usage_stats_to_proto (GPUMemoryUsage 
*proto_stats, + const aga_gpu_memory_usage_t *stats) +{ + proto_stats->set_memoryusage(stats->memory_usage); + proto_stats->set_activity(stats->activity); +} + +// populte PCIe stats proto +static inline void +aga_gpu_pcie_stats_to_proto (GPUPCIeStats *proto_stats, + const aga_gpu_pcie_stats_t *stats) +{ + proto_stats->set_replaycount(stats->replay_count); + proto_stats->set_recoverycount(stats->recovery_count); + proto_stats->set_replayrollovercount(stats->replay_rollover_count); + proto_stats->set_nacksentcount(stats->nack_sent_count); + proto_stats->set_nackreceivedcount(stats->nack_received_count); + proto_stats->set_rxbytes(stats->rx_bytes); + proto_stats->set_txbytes(stats->tx_bytes); + proto_stats->set_bidirbandwidth(stats->bidir_bandwidth); +} + +// populte VRAM usage stats proto +static inline void +aga_gpu_vram_usage_stats_to_proto (GPUVRAMUsage *proto_stats, + const aga_gpu_vram_usage_t *stats) +{ + proto_stats->set_totalvram(stats->total_vram); + proto_stats->set_usedvram(stats->used_vram); + proto_stats->set_freevram(stats->free_vram); + proto_stats->set_totalvisiblevram(stats->total_visible_vram); + proto_stats->set_usedvisiblevram(stats->used_visible_vram); + proto_stats->set_freevisiblevram(stats->free_visible_vram); + proto_stats->set_totalgtt(stats->total_gtt); + proto_stats->set_usedgtt(stats->used_gtt); + proto_stats->set_freegtt(stats->free_gtt); +} + +// populte GPU voltage proto +static inline void +aga_gpu_voltage_to_proto (GPUVoltage *proto_stats, + const aga_gpu_voltage_t *stats) +{ + proto_stats->set_voltage(stats->voltage); + proto_stats->set_gfxvoltage(stats->gfx_voltage); + proto_stats->set_memoryvoltage(stats->memory_voltage); +} + +// populate GPU XGMI link statistics +static inline void +aga_gpu_xgmi_link_stats_to_proto (amdgpu::GPUXGMILinkStats *proto_stats, + const aga_gpu_xgmi_link_stats_t *stats) +{ + proto_stats->set_dataread(stats->data_read); + proto_stats->set_datawrite(stats->data_write); +} + +// populate GPU violation statistics +static inline void +aga_gpu_violation_stats_to_proto (amdgpu::GPUViolationStats *proto_stats, + const aga_gpu_violation_stats_t *stats) +{ + proto_stats->set_currentaccumulatedcounter( + stats->current_accumulated_counter); + proto_stats->set_processorhotresidencyaccumulated( + stats->processor_hot_residency_accumulated); + proto_stats->set_pptresidencyaccumulated( + stats->ppt_residency_accumulated); + proto_stats->set_socketthermalresidencyaccumulated( + stats->socket_thermal_residency_accumulated); + proto_stats->set_vrthermalresidencyaccumulated( + stats->vr_thermal_residency_accumulated); + proto_stats->set_hbmthermalresidencyaccumulated( + stats->hbm_thermal_residency_accumulated); +} + +// populate proto buf stats from gpu stats +static inline void +aga_gpu_api_stats_to_proto (GPUStats *proto_stats, + const aga_gpu_stats_t *stats) +{ + proto_stats->set_packagepower(stats->package_power); + proto_stats->set_avgpackagepower(stats->avg_package_power); + aga_gpu_temp_stats_to_proto(proto_stats->mutable_temperature(), + &stats->temperature); + aga_gpu_usage_stats_to_proto(proto_stats->mutable_usage(), + &stats->usage); + aga_gpu_voltage_to_proto(proto_stats->mutable_voltage(), + &stats->voltage); + aga_gpu_pcie_stats_to_proto(proto_stats->mutable_pciestats(), + &stats->pcie_stats); + aga_gpu_vram_usage_stats_to_proto(proto_stats->mutable_vramusage(), + &stats->vram_usage); + proto_stats->set_energyconsumed(stats->energy_consumed); + proto_stats->set_powerusage(stats->power_usage); + 
proto_stats->set_totalcorrectableerrors(stats->total_correctable_errors); + proto_stats->set_totaluncorrectableerrors( + stats->total_uncorrectable_errors); + proto_stats->set_sdmacorrectableerrors(stats->sdma_correctable_errors); + proto_stats->set_sdmauncorrectableerrors(stats->sdma_uncorrectable_errors); + proto_stats->set_gfxcorrectableerrors(stats->gfx_correctable_errors); + proto_stats->set_gfxuncorrectableerrors(stats->gfx_uncorrectable_errors); + proto_stats->set_mmhubcorrectableerrors(stats->mmhub_correctable_errors); + proto_stats->set_mmhubuncorrectableerrors( + stats->mmhub_uncorrectable_errors); + proto_stats->set_athubcorrectableerrors(stats->athub_correctable_errors); + proto_stats->set_athubuncorrectableerrors( + stats->athub_uncorrectable_errors); + proto_stats->set_bifcorrectableerrors(stats->bif_correctable_errors); + proto_stats->set_bifuncorrectableerrors(stats->bif_uncorrectable_errors); + proto_stats->set_hdpcorrectableerrors(stats->hdp_correctable_errors); + proto_stats->set_hdpuncorrectableerrors(stats->hdp_uncorrectable_errors); + proto_stats->set_xgmiwaflcorrectableerrors( + stats->xgmi_wafl_correctable_errors); + proto_stats->set_xgmiwafluncorrectableerrors( + stats->xgmi_wafl_uncorrectable_errors); + proto_stats->set_dfcorrectableerrors(stats->df_correctable_errors); + proto_stats->set_dfuncorrectableerrors(stats->df_uncorrectable_errors); + proto_stats->set_smncorrectableerrors(stats->smn_correctable_errors); + proto_stats->set_smnuncorrectableerrors(stats->smn_uncorrectable_errors); + proto_stats->set_semcorrectableerrors(stats->sem_correctable_errors); + proto_stats->set_semuncorrectableerrors(stats->sem_uncorrectable_errors); + proto_stats->set_mp0correctableerrors(stats->mp0_correctable_errors); + proto_stats->set_mp0uncorrectableerrors(stats->mp0_uncorrectable_errors); + proto_stats->set_mp1correctableerrors(stats->mp1_correctable_errors); + proto_stats->set_mp1uncorrectableerrors(stats->mp1_uncorrectable_errors); + proto_stats->set_fusecorrectableerrors(stats->fuse_correctable_errors); + proto_stats->set_fuseuncorrectableerrors(stats->fuse_uncorrectable_errors); + proto_stats->set_umccorrectableerrors(stats->umc_correctable_errors); + proto_stats->set_umcuncorrectableerrors(stats->umc_uncorrectable_errors); + proto_stats->set_mcacorrectableerrors(stats->mca_correctable_errors); + proto_stats->set_mcauncorrectableerrors(stats->mca_uncorrectable_errors); + proto_stats->set_vcncorrectableerrors(stats->vcn_correctable_errors); + proto_stats->set_vcnuncorrectableerrors(stats->vcn_uncorrectable_errors); + proto_stats->set_jpegcorrectableerrors(stats->jpeg_correctable_errors); + proto_stats->set_jpeguncorrectableerrors(stats->jpeg_uncorrectable_errors); + proto_stats->set_ihcorrectableerrors(stats->ih_correctable_errors); + proto_stats->set_ihuncorrectableerrors(stats->ih_uncorrectable_errors); + proto_stats->set_mpiocorrectableerrors(stats->mpio_correctable_errors); + proto_stats->set_mpiouncorrectableerrors(stats->mpio_uncorrectable_errors); + proto_stats->set_xgmineighbor0txnops(stats->xgmi_neighbor0_tx_nops); + proto_stats->set_xgmineighbor0txrequests(stats->xgmi_neighbor0_tx_requests); + proto_stats->set_xgmineighbor0txresponses + (stats->xgmi_neighbor0_tx_responses); + proto_stats->set_xgmineighbor0txbeats(stats->xgmi_neighbor0_tx_beats); + proto_stats->set_xgmineighbor1txnops(stats->xgmi_neighbor1_tx_nops); + proto_stats->set_xgmineighbor1txrequests(stats->xgmi_neighbor1_tx_requests); + proto_stats->set_xgmineighbor1txresponses + 
(stats->xgmi_neighbor1_tx_responses); + proto_stats->set_xgmineighbor1txbeats(stats->xgmi_neighbor1_tx_beats); + proto_stats->set_xgmineighbor0txthroughput( + stats->xgmi_neighbor0_tx_throughput); + proto_stats->set_xgmineighbor1txthroughput( + stats->xgmi_neighbor1_tx_throughput); + proto_stats->set_xgmineighbor2txthroughput( + stats->xgmi_neighbor2_tx_throughput); + proto_stats->set_xgmineighbor3txthroughput( + stats->xgmi_neighbor3_tx_throughput); + proto_stats->set_xgmineighbor4txthroughput( + stats->xgmi_neighbor4_tx_throughput); + proto_stats->set_xgmineighbor5txthroughput( + stats->xgmi_neighbor5_tx_throughput); + proto_stats->set_fanspeed(stats->fan_speed); + proto_stats->set_gfxactivityaccumulated(stats->gfx_activity_accumulated); + proto_stats->set_memoryactivityaccumulated(stats->mem_activity_accumulated); + for (uint32_t i = 0; i < AGA_GPU_MAX_XGMI_LINKS; i++) { + aga_gpu_xgmi_link_stats_to_proto(proto_stats->add_xgmilinkstats(), + &stats->xgmi_link_stats[i]); + } + aga_gpu_violation_stats_to_proto(proto_stats->mutable_violationstats(), + &stats->violation_stats); +} + +// populate proto buf from gpu info +static inline void +aga_gpu_api_info_to_proto (aga_gpu_info_t *info, void *ctxt) +{ + GPUGetResponse *proto_rsp = (GPUGetResponse *)ctxt; + auto gpu = proto_rsp->add_response(); + GPUSpec *proto_spec = gpu->mutable_spec(); + GPUStatus *proto_status = gpu->mutable_status(); + GPUStats *proto_stats = gpu->mutable_stats(); + + aga_gpu_api_spec_to_proto(proto_spec, &info->spec); + aga_gpu_api_status_to_proto(proto_status, &info->status); + aga_gpu_api_stats_to_proto(proto_stats, &info->stats); +} + +// convert aga cper severity to proto +static inline amdgpu::CPERSeverity +aga_cper_severity_to_proto (aga_cper_severity_t severity) +{ + switch (severity) { + case AGA_CPER_SEVERITY_NON_FATAL_UNCORRECTED: + return amdgpu::CPER_SEVERITY_NON_FATAL_UNCORRECTED; + break; + case AGA_CPER_SEVERITY_FATAL: + return amdgpu::CPER_SEVERITY_FATAL; + break; + case AGA_CPER_SEVERITY_NON_FATAL_CORRECTED: + return amdgpu::CPER_SEVERITY_NON_FATAL_CORRECTED; + break; + default: + break; + } + return amdgpu::CPER_SEVERITY_NONE; +} + +// convert aga cper notification type to proto +static inline amdgpu::CPERNotificationType +aga_cper_notification_type_to_proto (aga_cper_notification_type_t ntfn_type) +{ + switch (ntfn_type) { + case AGA_CPER_NOTIFICATION_TYPE_CMC: + return amdgpu::CPER_NOTIFICATION_TYPE_CMC; + break; + case AGA_CPER_NOTIFICATION_TYPE_CPE: + return amdgpu::CPER_NOTIFICATION_TYPE_CPE; + break; + case AGA_CPER_NOTIFICATION_TYPE_MCE: + return amdgpu::CPER_NOTIFICATION_TYPE_MCE; + break; + case AGA_CPER_NOTIFICATION_TYPE_PCIE: + return amdgpu::CPER_NOTIFICATION_TYPE_PCIE; + break; + case AGA_CPER_NOTIFICATION_TYPE_INIT: + return amdgpu::CPER_NOTIFICATION_TYPE_INIT; + break; + case AGA_CPER_NOTIFICATION_TYPE_NMI: + return amdgpu::CPER_NOTIFICATION_TYPE_NMI; + break; + case AGA_CPER_NOTIFICATION_TYPE_BOOT: + return amdgpu::CPER_NOTIFICATION_TYPE_BOOT; + break; + case AGA_CPER_NOTIFICATION_TYPE_DMAR: + return amdgpu::CPER_NOTIFICATION_TYPE_DMAR; + break; + case AGA_CPER_NOTIFICATION_TYPE_SEA: + return amdgpu::CPER_NOTIFICATION_TYPE_SEA; + break; + case AGA_CPER_NOTIFICATION_TYPE_SEI: + return amdgpu::CPER_NOTIFICATION_TYPE_SEI; + break; + case AGA_CPER_NOTIFICATION_TYPE_PEI: + return amdgpu::CPER_NOTIFICATION_TYPE_PEI; + break; + case AGA_CPER_NOTIFICATION_TYPE_CXL_COMPONENT: + return amdgpu::CPER_NOTIFICATION_TYPE_CXL_COMPONENT; + break; + default: + break; + } + return 
amdgpu::CPER_NOTIFICATION_TYPE_NONE; +} + +// populate gpu cper information proto buf +static inline void +aga_gpu_cper_api_info_to_proto (aga_cper_info_t *info, + void *ctxt) +{ + GPUCPEREntry *cper; + GPUCPERGetResponse *proto_rsp = (GPUCPERGetResponse *)ctxt; + + if (!info->num_cper_entry) { + return; + } + cper = proto_rsp->add_cper(); + cper->set_gpu(info->gpu.id, OBJ_MAX_KEY_LEN); + for (uint32_t i = 0; i < info->num_cper_entry; i++) { + auto cper_entry = cper->add_cperentry(); + cper_entry->set_recordid(info->cper_entry[i].record_id); + cper_entry->set_severity( + aga_cper_severity_to_proto(info->cper_entry[i].severity)); + cper_entry->set_revision(info->cper_entry[i].revision); + cper_entry->set_timestamp(info->cper_entry[i].timestamp); + cper_entry->set_creatorid(info->cper_entry[i].creator_id); + cper_entry->set_notificationtype( + aga_cper_notification_type_to_proto( + info->cper_entry[i].notification_type)); + for (uint32_t j = 0; j < info->cper_entry[i].num_af_id; j++) { + cper_entry->add_afid(info->cper_entry[i].af_id[j]); + } + } +} + +#endif // __AGA_SVC_GPU_TO_PROTO_HPP__
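Note on consuming the new violation-stats fields: printGPUStats above only prints the accumulated violation counters, while this change also adds percentage fields and per-XCC repeated fields (GFXBelowHostLimit*, GFXLowUtilization*) to GPUViolationStats. The sketch below is illustrative only and is not part of this change; it assumes the protobuf-generated Go getters follow the proto field names (for example GetPPTResidencyPercentage and GetGFXBelowHostLimitPowerPercentage) and reuses the CLI's UINT64_MAX_VAL "not supported" sentinel convention. A local stand-in struct is used so the example compiles on its own; in the real CLI the values would come from the generated aga.GPUViolationStats message.

// Illustrative sketch (not part of this change): printing the new
// GPUViolationStats percentage and per-XCC fields in the style of
// printGPUStats. violationStats is a local stand-in for the generated
// aga.GPUViolationStats message and its assumed getters.
package main

import "fmt"

// UINT64_MAX_VAL mirrors the "field not supported" sentinel used by the CLI
const UINT64_MAX_VAL = ^uint64(0)

type violationStats struct {
	PPTResidencyPercentage           uint64   // whole-GPU percentage
	GFXBelowHostLimitPowerPercentage []uint64 // one entry per XCC
}

func printViolationPercentages(indent string, v *violationStats) {
	// scalar percentages print one per line, like the accumulated counters
	if v.PPTResidencyPercentage != UINT64_MAX_VAL &&
		v.PPTResidencyPercentage <= 100 {
		fmt.Printf(indent+"%-38s : %d\n", "PPT residency (in %)",
			v.PPTResidencyPercentage)
	}
	// per-XCC values go on a single line, printing N/A for invalid
	// entries, mirroring how VCN/JPEG activity is handled above
	str := fmt.Sprintf(indent+"%-38s : ", "GFX below host limit power (in %)")
	validEntry := false
	for _, pct := range v.GFXBelowHostLimitPowerPercentage {
		if pct == UINT64_MAX_VAL || pct > 100 {
			str = fmt.Sprintf("%sN/A ", str)
		} else {
			validEntry = true
			str = fmt.Sprintf("%s%d%% ", str, pct)
		}
	}
	if validEntry {
		fmt.Printf("%s\n", str)
	}
}

func main() {
	// example with four XCCs reporting and one unsupported entry
	printViolationPercentages("  ", &violationStats{
		PPTResidencyPercentage:           3,
		GFXBelowHostLimitPowerPercentage: []uint64{1, 0, 2, 5, UINT64_MAX_VAL},
	})
}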