Skip to content

Commit

Permalink
Add support for Intel Granite Rapids and Sierra Forest
Browse files Browse the repository at this point in the history
  • Loading branch information
TomTheBear committed Oct 30, 2024
1 parent 04ef0ca commit 1baff51
Show file tree
Hide file tree
Showing 22 changed files with 6,361 additions and 493 deletions.
99 changes: 99 additions & 0 deletions groups/GNR/MEM.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
SHORT Memory bandwidth in MBytes/s

EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
MBOX0C0 CAS_COUNT_SCH0_RD
MBOX0C1 CAS_COUNT_SCH0_WR
MBOX0C2 CAS_COUNT_SCH1_RD
MBOX0C3 CAS_COUNT_SCH1_WR
MBOX1C0 CAS_COUNT_SCH0_RD
MBOX1C1 CAS_COUNT_SCH0_WR
MBOX1C2 CAS_COUNT_SCH1_RD
MBOX1C3 CAS_COUNT_SCH1_WR
MBOX2C0 CAS_COUNT_SCH0_RD
MBOX2C1 CAS_COUNT_SCH0_WR
MBOX2C2 CAS_COUNT_SCH1_RD
MBOX2C3 CAS_COUNT_SCH1_WR
MBOX3C0 CAS_COUNT_SCH0_RD
MBOX3C1 CAS_COUNT_SCH0_WR
MBOX3C2 CAS_COUNT_SCH1_RD
MBOX3C3 CAS_COUNT_SCH1_WR
MBOX4C0 CAS_COUNT_SCH0_RD
MBOX4C1 CAS_COUNT_SCH0_WR
MBOX4C2 CAS_COUNT_SCH1_RD
MBOX4C3 CAS_COUNT_SCH1_WR
MBOX5C0 CAS_COUNT_SCH0_RD
MBOX5C1 CAS_COUNT_SCH0_WR
MBOX5C2 CAS_COUNT_SCH1_RD
MBOX5C3 CAS_COUNT_SCH1_WR
MBOX6C0 CAS_COUNT_SCH0_RD
MBOX6C1 CAS_COUNT_SCH0_WR
MBOX6C2 CAS_COUNT_SCH1_RD
MBOX6C3 CAS_COUNT_SCH1_WR
MBOX7C0 CAS_COUNT_SCH0_RD
MBOX7C1 CAS_COUNT_SCH0_WR
MBOX7C2 CAS_COUNT_SCH1_RD
MBOX7C3 CAS_COUNT_SCH1_WR
MBOX8C0 CAS_COUNT_SCH0_RD
MBOX8C1 CAS_COUNT_SCH0_WR
MBOX8C2 CAS_COUNT_SCH1_RD
MBOX8C3 CAS_COUNT_SCH1_WR
MBOX9C0 CAS_COUNT_SCH0_RD
MBOX9C1 CAS_COUNT_SCH0_WR
MBOX9C2 CAS_COUNT_SCH1_RD
MBOX9C3 CAS_COUNT_SCH1_WR
MBOX10C0 CAS_COUNT_SCH0_RD
MBOX10C1 CAS_COUNT_SCH0_WR
MBOX10C2 CAS_COUNT_SCH1_RD
MBOX10C3 CAS_COUNT_SCH1_WR
MBOX11C0 CAS_COUNT_SCH0_RD
MBOX11C1 CAS_COUNT_SCH0_WR
MBOX11C2 CAS_COUNT_SCH1_RD
MBOX11C3 CAS_COUNT_SCH1_WR
MBOX12C0 CAS_COUNT_SCH0_RD
MBOX12C1 CAS_COUNT_SCH0_WR
MBOX12C2 CAS_COUNT_SCH1_RD
MBOX12C3 CAS_COUNT_SCH1_WR
MBOX13C0 CAS_COUNT_SCH0_RD
MBOX13C1 CAS_COUNT_SCH0_WR
MBOX13C2 CAS_COUNT_SCH1_RD
MBOX13C3 CAS_COUNT_SCH1_WR
MBOX14C0 CAS_COUNT_SCH0_RD
MBOX14C1 CAS_COUNT_SCH0_WR
MBOX14C2 CAS_COUNT_SCH1_RD
MBOX14C3 CAS_COUNT_SCH1_WR
MBOX15C0 CAS_COUNT_SCH0_RD
MBOX15C1 CAS_COUNT_SCH0_WR
MBOX15C2 CAS_COUNT_SCH1_RD
MBOX15C3 CAS_COUNT_SCH1_WR


METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C2+MBOX1C0+MBOX1C2+MBOX2C0+MBOX2C2+MBOX3C0+MBOX3C2+MBOX4C0+MBOX4C2+MBOX5C0+MBOX5C2+MBOX6C0+MBOX6C2+MBOX7C0+MBOX7C2+MBOX8C0+MBOX8C2+MBOX9C0+MBOX9C2+MBOX10C0+MBOX10C2+MBOX11C0+MBOX11C2+MBOX12C0+MBOX12C2+MBOX13C0+MBOX13C2+MBOX14C0+MBOX14C2+MBOX15C0+MBOX15C2)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C2+MBOX1C0+MBOX1C2+MBOX2C0+MBOX2C2+MBOX3C0+MBOX3C2+MBOX4C0+MBOX4C2+MBOX5C0+MBOX5C2+MBOX6C0+MBOX6C2+MBOX7C0+MBOX7C2+MBOX8C0+MBOX8C2+MBOX9C0+MBOX9C2+MBOX10C0+MBOX10C2+MBOX11C0+MBOX11C2+MBOX12C0+MBOX12C2+MBOX13C0+MBOX13C2+MBOX14C0+MBOX14C2+MBOX15C0+MBOX15C2)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C3+MBOX1C1+MBOX1C3+MBOX2C1+MBOX2C3+MBOX3C1+MBOX3C3+MBOX4C1+MBOX4C3+MBOX5C1+MBOX5C3+MBOX6C1+MBOX6C3+MBOX7C1+MBOX7C3+MBOX8C1+MBOX8C3+MBOX9C1+MBOX9C3+MBOX10C1+MBOX10C3+MBOX11C1+MBOX11C3+MBOX12C1+MBOX12C3+MBOX13C1+MBOX13C3+MBOX14C1+MBOX14C3+MBOX15C1+MBOX15C3)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C3+MBOX1C1+MBOX1C3+MBOX2C1+MBOX2C3+MBOX3C1+MBOX3C3+MBOX4C1+MBOX4C3+MBOX5C1+MBOX5C3+MBOX6C1+MBOX6C3+MBOX7C1+MBOX7C3+MBOX8C1+MBOX8C3+MBOX9C1+MBOX9C3+MBOX10C1+MBOX10C3+MBOX11C1+MBOX11C3+MBOX12C1+MBOX12C3+MBOX13C1+MBOX13C3+MBOX14C1+MBOX14C3+MBOX15C1+MBOX15C3)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C2+MBOX1C0+MBOX1C2+MBOX2C0+MBOX2C2+MBOX3C0+MBOX3C2+MBOX4C0+MBOX4C2+MBOX5C0+MBOX5C2+MBOX6C0+MBOX6C2+MBOX7C0+MBOX7C2+MBOX8C0+MBOX8C2+MBOX9C0+MBOX9C2+MBOX10C0+MBOX10C2+MBOX11C0+MBOX11C2+MBOX12C0+MBOX12C2+MBOX13C0+MBOX13C2+MBOX14C0+MBOX14C2+MBOX15C0+MBOX15C2+MBOX0C1+MBOX0C3+MBOX1C1+MBOX1C3+MBOX2C1+MBOX2C3+MBOX3C1+MBOX3C3+MBOX4C1+MBOX4C3+MBOX5C1+MBOX5C3+MBOX6C1+MBOX6C3+MBOX7C1+MBOX7C3+MBOX8C1+MBOX8C3+MBOX9C1+MBOX9C3+MBOX10C1+MBOX10C3+MBOX11C1+MBOX11C3+MBOX12C1+MBOX12C3+MBOX13C1+MBOX13C3+MBOX14C1+MBOX14C3+MBOX15C1+MBOX15C3)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C2+MBOX1C0+MBOX1C2+MBOX2C0+MBOX2C2+MBOX3C0+MBOX3C2+MBOX4C0+MBOX4C2+MBOX5C0+MBOX5C2+MBOX6C0+MBOX6C2+MBOX7C0+MBOX7C2+MBOX8C0+MBOX8C2+MBOX9C0+MBOX9C2+MBOX10C0+MBOX10C2+MBOX11C0+MBOX11C2+MBOX12C0+MBOX12C2+MBOX13C0+MBOX13C2+MBOX14C0+MBOX14C2+MBOX15C0+MBOX15C2+MBOX0C1+MBOX0C3+MBOX1C1+MBOX1C3+MBOX2C1+MBOX2C3+MBOX3C1+MBOX3C3+MBOX4C1+MBOX4C3+MBOX5C1+MBOX5C3+MBOX6C1+MBOX6C3+MBOX7C1+MBOX7C3+MBOX8C1+MBOX8C3+MBOX9C1+MBOX9C3+MBOX10C1+MBOX10C3+MBOX11C1+MBOX11C3+MBOX12C1+MBOX12C3+MBOX13C1+MBOX13C3+MBOX14C1+MBOX14C3+MBOX15C1+MBOX15C3)*64.0

LONG
Formulas:
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_SCH0_RD)+SUM(CAS_COUNT_SCH1_RD))*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_SCH0_RD)+SUM(CAS_COUNT_SCH1_RD))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_SCH0_WR)+SUM(CAS_COUNT_SCH1_WR))*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_SCH0_WR)+SUM(CAS_COUNT_SCH1_WR))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_SCH0_RD)+SUM(CAS_COUNT_SCH1_RD)+SUM(CAS_COUNT_SCH0_WR)+SUM(CAS_COUNT_SCH1_WR))*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_SCH0_RD)+SUM(CAS_COUNT_SCH1_RD)+SUM(CAS_COUNT_SCH0_WR)+SUM(CAS_COUNT_SCH1_WR))*64.0
--
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Since this group is based on Uncore events, it can only be measured on a
per-socket basis. Some of the counters may not be available on your system.
Also outputs the total data volume transferred from main memory.

35 changes: 25 additions & 10 deletions src/access-daemon/accessDaemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@
#include <perfmon_icelakeX_counters.h>
#include <intel_perfmon_uncore_discovery.h>
#include <perfmon_sapphirerapids_counters.h>
#include <perfmon_graniterapids_counters.h>
#include <perfmon_sierraforrest_counters.h>
#include <topology.h>
#include <cpuid.h>
#include <lock.h>
Expand Down Expand Up @@ -3110,7 +3112,8 @@ static void spr_read_unit(AccessDataRecord *record)
{
case ACCESS_TYPE_MMIO:
if ((record->device >= MMIO_IMC_DEVICE_0_CH_0 && record->device <= MMIO_IMC_DEVICE_1_CH_7) ||
(record->device >= MMIO_HBM_DEVICE_0 && record->device <= MMIO_HBM_DEVICE_31))
(record->device >= MMIO_HBM_DEVICE_0 && record->device <= MMIO_HBM_DEVICE_31) ||
(record->device >= PCI_R2PCIE_DEVICE0 && record->device <= PCI_R2PCIE_DEVICE31))
{
record->data = (uint32_t)*((uint32_t *)(unit->io_addr + unit->mmap_offset + unit->ctrl_offset + (sizeof(uint32_t) * offset)));
}
Expand Down Expand Up @@ -3370,7 +3373,8 @@ static void spr_write_unit(AccessDataRecord *record)
{
case ACCESS_TYPE_MMIO:
if ((record->device >= MMIO_IMC_DEVICE_0_CH_0 && record->device <= MMIO_IMC_DEVICE_1_CH_7) ||
(record->device >= MMIO_HBM_DEVICE_0 && record->device <= MMIO_HBM_DEVICE_31))
(record->device >= MMIO_HBM_DEVICE_0 && record->device <= MMIO_HBM_DEVICE_31) ||
(record->device >= PCI_R2PCIE_DEVICE0 && record->device <= PCI_R2PCIE_DEVICE31))
{
*((uint32_t *)(unit->io_addr + unit->mmap_offset + unit->ctrl_offset + (sizeof(uint32_t) * offset))) = (uint32_t)record->data;
}
Expand Down Expand Up @@ -3484,7 +3488,7 @@ static void spr_write_unit(AccessDataRecord *record)

static void spr_read(AccessDataRecord *record)
{
if (record->device == MSR_UBOX_DEVICE && ((record->reg == FAKE_UNC_GLOBAL_CTRL) || ((record->reg >= FAKE_UNC_GLOBAL_STATUS0) && (record->reg <= FAKE_UNC_GLOBAL_STATUS3))))
if (record->device == MSR_UBOX_DEVICE && ((record->reg == FAKE_UNC_GLOBAL_CTRL) || ((record->reg >= FAKE_UNC_GLOBAL_STATUS0) && (record->reg <= FAKE_UNC_GLOBAL_STATUS8))))
{
spr_read_global(record);
}
Expand All @@ -3496,7 +3500,7 @@ static void spr_read(AccessDataRecord *record)

static void spr_write(AccessDataRecord *record)
{
if (record->device == MSR_UBOX_DEVICE && ((record->reg == FAKE_UNC_GLOBAL_CTRL) || ((record->reg >= FAKE_UNC_GLOBAL_STATUS0) && (record->reg <= FAKE_UNC_GLOBAL_STATUS3))))
if (record->device == MSR_UBOX_DEVICE && ((record->reg == FAKE_UNC_GLOBAL_CTRL) || ((record->reg >= FAKE_UNC_GLOBAL_STATUS0) && (record->reg <= FAKE_UNC_GLOBAL_STATUS8))))
{
spr_write_global(record);
}
Expand Down Expand Up @@ -3696,6 +3700,22 @@ int main(void)
allowedPci = allowed_pci_spr;
isIntelUncoreDiscovery = 1;
}
else if (model == GRANITERAPIDS)
{
isPCIUncore = 1;
allowed = allowed_spr;
isPCI64 = 1;
allowedPci = allowed_pci_spr;
isIntelUncoreDiscovery = 1;
}
else if (model == SIERRAFORREST)
{
isPCIUncore = 1;
allowed = allowed_spr;
isPCI64 = 1;
allowedPci = allowed_pci_spr;
isIntelUncoreDiscovery = 1;
}
else if ((model == ATOM_SILVERMONT_C) ||
(model == ATOM_SILVERMONT_E) ||
(model == ATOM_SILVERMONT_Z1) ||
Expand All @@ -3712,11 +3732,6 @@ int main(void)
isPCIUncore = 1;
allowedPci = allowed_pci_knl;
}
else if (model == SAPPHIRERAPIDS)
{
allowed = allowed_icx;
isPCI64 = 1;
}
break;
case K8_FAMILY:
case K10_FAMILY:
Expand Down Expand Up @@ -3903,7 +3918,7 @@ int main(void)
else if (isIntelUncoreDiscovery)
{
pci_devices_daemon = NULL;
int err = perfmon_uncore_discovery(&perfmon_discovery);
int err = perfmon_uncore_discovery(model, &perfmon_discovery);
if (err < 0)
{
syslog(LOG_ERR, "Failed to run uncore discovery");
Expand Down
14 changes: 8 additions & 6 deletions src/access_x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
#include <access_x86_translate.h>
#include <affinity.h>

/* Evaluates to nonzero when cpuid_info reports the P6 family and a model equal
 * to SAPPHIRERAPIDS, GRANITERAPIDS or SIERRAFORREST; evaluates to 0 otherwise. */
#define ARCH_SPR_GNR_SRF \
    ((cpuid_info.family == P6_FAMILY) && \
     ((cpuid_info.model == SAPPHIRERAPIDS) || \
      (cpuid_info.model == GRANITERAPIDS) || \
      (cpuid_info.model == SIERRAFORREST)))

/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */

int
Expand All @@ -60,7 +62,7 @@ access_x86_init(int cpu_id)
{
if (cpuid_info.supportUncore)
{
if (!((cpuid_info.family == P6_FAMILY) && (cpuid_info.model == SAPPHIRERAPIDS)))
if (!(ARCH_SPR_GNR_SRF))
{
ret = access_x86_pci_init(affinity_thread2socket_lookup[cpu_id]);
}
Expand All @@ -72,7 +74,7 @@ access_x86_init(int cpu_id)
ERROR_PRINT(Initialization of MMIO access failed);
}
}
else if ((cpuid_info.family == P6_FAMILY) && (cpuid_info.model == SAPPHIRERAPIDS))
else if (ARCH_SPR_GNR_SRF)
{
ret = access_x86_translate_init(cpu_id);
}
Expand Down Expand Up @@ -109,7 +111,7 @@ access_x86_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *da
*data = tmp;
}
}
else if ((cpuid_info.family == P6_FAMILY) && (cpuid_info.model == SAPPHIRERAPIDS))
else if (ARCH_SPR_GNR_SRF)
{
if (access_x86_translate_check(dev, cpu_id))
{
Expand Down Expand Up @@ -158,7 +160,7 @@ access_x86_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t da
err = access_x86_mmio_write(dev, affinity_thread2socket_lookup[cpu_id], reg, data);
}
}
else if ((cpuid_info.family == P6_FAMILY) && (cpuid_info.model == SAPPHIRERAPIDS))
else if (ARCH_SPR_GNR_SRF)
{
if (access_x86_translate_check(dev, cpu_id))
{
Expand Down Expand Up @@ -198,7 +200,7 @@ access_x86_finalize(int cpu_id)
DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize of MMIO access);
access_x86_mmio_finalize(affinity_thread2socket_lookup[cpu_id]);
}
else if ((cpuid_info.family == P6_FAMILY) && (cpuid_info.model == SAPPHIRERAPIDS))
else if (ARCH_SPR_GNR_SRF)
{
DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize of Fake access);
access_x86_translate_finalize(cpu_id);
Expand Down Expand Up @@ -227,7 +229,7 @@ access_x86_check(PciDeviceIndex dev, int cpu_id)
{
return access_x86_mmio_check(dev, affinity_thread2socket_lookup[cpu_id]);
}
else if ((cpuid_info.family == P6_FAMILY) && (cpuid_info.model == SAPPHIRERAPIDS))
else if (ARCH_SPR_GNR_SRF)
{
return access_x86_translate_check(dev, cpu_id);
}
Expand Down
Loading

0 comments on commit 1baff51

Please sign in to comment.