Skip to content

Commit

Permalink
EDAC/ghes: Setup DIMM label from DMI and use it in error reports
Browse files Browse the repository at this point in the history
The ghes driver reports errors with 'unknown label' even if the actual
DIMM label is known, e.g.:

 EDAC MC0: 1 CE Single-bit ECC on unknown label (node:0 card:0
   module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x966a9b3 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

Fix this by using struct dimm_info's label string in error reports:

 EDAC MC0: 1 CE Single-bit ECC on N0 DIMM_A0 (node:0 card:0 module:0
   rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x99223d8 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

The labels are initialized by reading the bank and device strings
from DMI. Now, the label information can also read from sysfs. E.g. a
ThunderX2 system will show the following:

  /sys/devices/system/edac/mc/mc0/dimm0/dimm_label:N0 DIMM_A0
  /sys/devices/system/edac/mc/mc0/dimm1/dimm_label:N0 DIMM_B0
  /sys/devices/system/edac/mc/mc0/dimm2/dimm_label:N0 DIMM_C0
  /sys/devices/system/edac/mc/mc0/dimm3/dimm_label:N0 DIMM_D0
  /sys/devices/system/edac/mc/mc0/dimm4/dimm_label:N0 DIMM_E0
  /sys/devices/system/edac/mc/mc0/dimm5/dimm_label:N0 DIMM_F0
  /sys/devices/system/edac/mc/mc0/dimm6/dimm_label:N0 DIMM_G0
  /sys/devices/system/edac/mc/mc0/dimm7/dimm_label:N0 DIMM_H0
  /sys/devices/system/edac/mc/mc0/dimm8/dimm_label:N1 DIMM_I0
  /sys/devices/system/edac/mc/mc0/dimm9/dimm_label:N1 DIMM_J0
  /sys/devices/system/edac/mc/mc0/dimm10/dimm_label:N1 DIMM_K0
  /sys/devices/system/edac/mc/mc0/dimm11/dimm_label:N1 DIMM_L0
  /sys/devices/system/edac/mc/mc0/dimm12/dimm_label:N1 DIMM_M0
  /sys/devices/system/edac/mc/mc0/dimm13/dimm_label:N1 DIMM_N0
  /sys/devices/system/edac/mc/mc0/dimm14/dimm_label:N1 DIMM_O0
  /sys/devices/system/edac/mc/mc0/dimm15/dimm_label:N1 DIMM_P0

Since dimm_labels can be rewritten, that label will be used in a later
error report:

  # echo foobar >/sys/devices/system/edac/mc/mc0/dimm0/dimm_label
  # # some error injection here
  # dmesg | grep foobar
  [ 751.383533] EDAC MC0: 1 CE Single-bit ECC on foobar (node:0 card:0
  module:0 rank:1 bank:259 col:3 bit_pos:16 DIMM location:N0 DIMM_A0
  page:0x8c8dc74 offset:0x0 grain:1 syndrome:0x0 - APEI location:
  node:0 card:0 module:0 rank:1 bank:259 col:3 bit_pos:16 DIMM
  location:N0 DIMM_A0 status(0x0000000000000400): Storage error in DRAM
  memory)

 [ bp: Remove curly brackets around a single if-statement in dimm_setup_label(). ]

Signed-off-by: Robert Richter <rrichter@marvell.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20200528101307.23245-1-rrichter@marvell.com
  • Loading branch information
Robert Richter authored and suryasaimadhu committed Jun 16, 2020
1 parent b3a9e3b commit cb51a37
Showing 1 changed file with 24 additions and 11 deletions.
35 changes: 24 additions & 11 deletions drivers/edac/ghes_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,27 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
(*num_dimm)++;
}

static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle)
static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
{
struct dimm_info *dimm;

mci_for_each_dimm(mci, dimm) {
if (dimm->smbios_handle == handle)
return dimm->idx;
return dimm;
}

return -1;
return NULL;
}

static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
{
const char *bank = NULL, *device = NULL;

dmi_memdev_name(handle, &bank, &device);

/* both strings must be non-zero */
if (bank && *bank && device && *device)
snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device);
}

static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
Expand Down Expand Up @@ -179,9 +190,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
dimm->dtype = DEV_UNKNOWN;
dimm->grain = 128; /* Likely, worse case */

/*
* FIXME: It shouldn't be hard to also fill the DIMM labels
*/
dimm_setup_label(dimm, entry->handle);

if (dimm->nr_pages) {
edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
Expand Down Expand Up @@ -228,7 +237,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
memset(e, 0, sizeof (*e));
e->error_count = 1;
e->grain = 1;
strcpy(e->label, "unknown label");
e->msg = pvt->msg;
e->other_detail = pvt->other_detail;
e->top_layer = -1;
Expand Down Expand Up @@ -345,7 +353,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
const char *bank = NULL, *device = NULL;
int index = -1;
struct dimm_info *dimm;

dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
if (bank != NULL && device != NULL)
Expand All @@ -354,13 +362,18 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
mem_err->mem_dev_handle);

index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
if (index >= 0)
e->top_layer = index;
dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
if (dimm) {
e->top_layer = dimm->idx;
strcpy(e->label, dimm->label);
}
}
if (p > e->location)
*(p - 1) = '\0';

if (!*e->label)
strcpy(e->label, "unknown memory");

/* All other fields are mapped on e->other_detail */
p = pvt->other_detail;
p += snprintf(p, sizeof(pvt->other_detail),
Expand Down

0 comments on commit cb51a37

Please sign in to comment.