Skip to content

Commit

Permalink
EDAC, amd64: Define and register UMC error decode function
Browse files Browse the repository at this point in the history
How we need to decode UMC errors is different from how we decode bus
errors, so let's define a new function for this. We also need a way to
determine the UMC channel since we're not guaranteed that there is a
fixed relation between channel and MCA bank.

Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/1480359593-80369-1-git-send-email-Yazen.Ghannam@amd.com
[ Fold in decode_synd_reg(), simplify. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
  • Loading branch information
yghannam authored and suryasaimadhu committed Nov 29, 2016
1 parent d27f3a3 commit 713ad54
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 3 deletions.
89 changes: 86 additions & 3 deletions drivers/edac/amd64_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -2392,7 +2392,13 @@ static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
string = "Failed to map error addr to a csrow";
break;
case ERR_CHANNEL:
string = "unknown syndrome - possible error reporting race";
string = "Unknown syndrome - possible error reporting race";
break;
case ERR_SYND:
string = "MCA_SYND not valid - unknown syndrome and csrow";
break;
case ERR_NORM_ADDR:
string = "Cannot decode normalized address";
break;
default:
string = "WTF error";
Expand Down Expand Up @@ -2441,6 +2447,76 @@ static inline void decode_bus_error(int node_id, struct mce *m)
__log_ecc_error(mci, &err, ecc_type);
}

/*
* To find the UMC channel represented by this bank we need to match on its
* instance_id. The instance_id of a bank is held in the lower 32 bits of its
* IPID.
*/
static int find_umc_channel(struct amd64_pvt *pvt, struct mce *m)
{
u32 umc_instance_id[] = {0x50f00, 0x150f00};
u32 instance_id = m->ipid & GENMASK(31, 0);
int i, channel = -1;

for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
if (umc_instance_id[i] == instance_id)
channel = i;

return channel;
}

static void decode_umc_error(int node_id, struct mce *m)
{
u8 ecc_type = (m->status >> 45) & 0x3;
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
struct err_info err;
u64 sys_addr;

mci = edac_mc_find(node_id);
if (!mci)
return;

pvt = mci->pvt_info;

memset(&err, 0, sizeof(err));

if (m->status & MCI_STATUS_DEFERRED)
ecc_type = 3;

err.channel = find_umc_channel(pvt, m);
if (err.channel < 0) {
err.err_code = ERR_CHANNEL;
goto log_error;
}

if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
err.err_code = ERR_NORM_ADDR;
goto log_error;
}

error_address_to_page_and_offset(sys_addr, &err);

if (!(m->status & MCI_STATUS_SYNDV)) {
err.err_code = ERR_SYND;
goto log_error;
}

if (ecc_type == 2) {
u8 length = (m->synd >> 18) & 0x3f;

if (length)
err.syndrome = (m->synd >> 32) & GENMASK(length - 1, 0);
else
err.err_code = ERR_CHANNEL;
}

err.csrow = m->synd & 0x7;

log_error:
__log_ecc_error(mci, &err, ecc_type);
}

/*
* Use pvt->F3 which contains the F3 CPU PCI device to get the related
* F1 (AddrMap) and F2 (Dct) devices. Return negative value on error.
Expand Down Expand Up @@ -3232,7 +3308,10 @@ static int init_one_instance(unsigned int nid)
if (report_gart_errors)
amd_report_gart_errors(true);

amd_register_ecc_decoder(decode_bus_error);
if (pvt->umc)
amd_register_ecc_decoder(decode_umc_error);
else
amd_register_ecc_decoder(decode_bus_error);

return 0;

Expand Down Expand Up @@ -3323,7 +3402,11 @@ static void remove_one_instance(unsigned int nid)

/* unregister from EDAC MCE */
amd_report_gart_errors(false);
amd_unregister_ecc_decoder(decode_bus_error);

if (pvt->umc)
amd_unregister_ecc_decoder(decode_umc_error);
else
amd_unregister_ecc_decoder(decode_bus_error);

kfree(ecc_stngs[nid]);
ecc_stngs[nid] = NULL;
Expand Down
2 changes: 2 additions & 0 deletions drivers/edac/amd64_edac.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,8 @@ enum err_codes {
ERR_NODE = -1,
ERR_CSROW = -2,
ERR_CHANNEL = -3,
ERR_SYND = -4,
ERR_NORM_ADDR = -5,
};

struct err_info {
Expand Down

0 comments on commit 713ad54

Please sign in to comment.