Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7cfd4a87 authored by Borislav Petkov, committed by Borislav Petkov
Browse files

EDAC, MCE: Pass complete MCE info to decoders



... instead of the MCi_STATUS info only for improved handling of certain
types of errors later.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
parent 6337583d
Loading
Loading
Loading
Loading
+10 −3
Original line number Original line Diff line number Diff line
@@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
		amd64_handle_ue(mci, info);
		amd64_handle_ue(mci, info);
}
}


void amd64_decode_bus_error(int node_id, struct err_regs *regs)
void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg)
{
{
	struct mem_ctl_info *mci = mci_lookup[node_id];
	struct mem_ctl_info *mci = mci_lookup[node_id];
	struct err_regs regs;


	__amd64_decode_bus_error(mci, regs);
	regs.nbsl  = (u32) m->status;
	regs.nbsh  = (u32)(m->status >> 32);
	regs.nbeal = (u32) m->addr;
	regs.nbeah = (u32)(m->addr >> 32);
	regs.nbcfg = nbcfg;

	__amd64_decode_bus_error(mci, &regs);


	/*
	/*
	 * Check the UE bit of the NB status high register, if set generate some
	 * Check the UE bit of the NB status high register, if set generate some
@@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
	 *
	 *
	 * FIXME: this should go somewhere else, if at all.
	 * FIXME: this should go somewhere else, if at all.
	 */
	 */
	if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
	if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
		edac_mc_handle_ue_no_info(mci, "UE bit is set");
		edac_mc_handle_ue_no_info(mci, "UE bit is set");


}
}
+8 −2
Original line number Original line Diff line number Diff line
@@ -10,11 +10,14 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
				size_t count)
				size_t count)
{
{
	struct amd64_pvt *pvt = mci->pvt_info;
	struct amd64_pvt *pvt = mci->pvt_info;
	unsigned long long value;
	u64 value;
	int ret = 0;
	int ret = 0;
	struct mce m;


	ret = strict_strtoull(data, 16, &value);
	ret = strict_strtoull(data, 16, &value);
	if (ret != -EINVAL) {
	if (ret != -EINVAL) {
		struct err_regs *regs = &pvt->ctl_error_info;

		debugf0("received NBEA= 0x%llx\n", value);
		debugf0("received NBEA= 0x%llx\n", value);


		/* place the value into the virtual error packet */
		/* place the value into the virtual error packet */
@@ -22,9 +25,12 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
		value >>= 32;
		value >>= 32;
		pvt->ctl_error_info.nbeah = (u32) value;
		pvt->ctl_error_info.nbeah = (u32) value;


		m.addr   = value;
		m.status = regs->nbsl | ((u64)regs->nbsh << 32);

		/* Process the Mapping request */
		/* Process the Mapping request */
		/* TODO: Add race prevention */
		/* TODO: Add race prevention */
		amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info);
		amd_decode_nb_mce(pvt->mc_node_id, &m, regs->nbcfg);


		return count;
		return count;
	}
	}
+35 −39
Original line number Original line Diff line number Diff line
@@ -2,7 +2,7 @@
#include "edac_mce_amd.h"
#include "edac_mce_amd.h"


static bool report_gart_errors;
static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);


void amd_report_gart_errors(bool v)
void amd_report_gart_errors(bool v)
{
{
@@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v)
}
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
EXPORT_SYMBOL_GPL(amd_report_gart_errors);


void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
{
{
	nb_bus_decoder = f;
	nb_bus_decoder = f;
}
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);


void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
{
{
	if (nb_bus_decoder) {
	if (nb_bus_decoder) {
		WARN_ON(nb_bus_decoder != f);
		WARN_ON(nb_bus_decoder != f);
@@ -97,17 +97,17 @@ const char *ext_msgs[] = {
};
};
EXPORT_SYMBOL_GPL(ext_msgs);
EXPORT_SYMBOL_GPL(ext_msgs);


static void amd_decode_dc_mce(u64 mc0_status)
static void amd_decode_dc_mce(struct mce *m)
{
{
	u32 ec  = mc0_status & 0xffff;
	u32 ec  = m->status & 0xffff;
	u32 xec = (mc0_status >> 16) & 0xf;
	u32 xec = (m->status >> 16) & 0xf;


	pr_emerg(HW_ERR "Data Cache Error: ");
	pr_emerg(HW_ERR "Data Cache Error: ");


	if (xec == 1 && TLB_ERROR(ec))
	if (xec == 1 && TLB_ERROR(ec))
		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
	else if (xec == 0) {
	else if (xec == 0) {
		if (mc0_status & (1ULL << 40))
		if (m->status & (1ULL << 40))
			pr_cont(" during Data Scrub.\n");
			pr_cont(" during Data Scrub.\n");
		else if (TLB_ERROR(ec))
		else if (TLB_ERROR(ec))
			pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
			pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
@@ -140,10 +140,10 @@ static void amd_decode_dc_mce(u64 mc0_status)
	pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
	pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
}
}


static void amd_decode_ic_mce(u64 mc1_status)
static void amd_decode_ic_mce(struct mce *m)
{
{
	u32 ec  = mc1_status & 0xffff;
	u32 ec  = m->status & 0xffff;
	u32 xec = (mc1_status >> 16) & 0xf;
	u32 xec = (m->status >> 16) & 0xf;


	pr_emerg(HW_ERR "Instruction Cache Error");
	pr_emerg(HW_ERR "Instruction Cache Error");


@@ -154,7 +154,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
			pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
			pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
		else if (BUS_ERROR(ec)) {
		else if (BUS_ERROR(ec)) {
			if (boot_cpu_data.x86 == 0xf &&
			if (boot_cpu_data.x86 == 0xf &&
			    (mc1_status & (1ULL << 58)))
			    (m->status & BIT(58)))
				pr_cont(" during system linefill.\n");
				pr_cont(" during system linefill.\n");
			else
			else
				pr_cont(" during attempted NB data read.\n");
				pr_cont(" during attempted NB data read.\n");
@@ -197,10 +197,10 @@ static void amd_decode_ic_mce(u64 mc1_status)
	pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
	pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
}
}


static void amd_decode_bu_mce(u64 mc2_status)
static void amd_decode_bu_mce(struct mce *m)
{
{
	u32 ec = mc2_status & 0xffff;
	u32 ec = m->status & 0xffff;
	u32 xec = (mc2_status >> 16) & 0xf;
	u32 xec = (m->status >> 16) & 0xf;


	pr_emerg(HW_ERR "Bus Unit Error");
	pr_emerg(HW_ERR "Bus Unit Error");


@@ -239,10 +239,10 @@ static void amd_decode_bu_mce(u64 mc2_status)
	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
}
}


static void amd_decode_ls_mce(u64 mc3_status)
static void amd_decode_ls_mce(struct mce *m)
{
{
	u32 ec  = mc3_status & 0xffff;
	u32 ec  = m->status & 0xffff;
	u32 xec = (mc3_status >> 16) & 0xf;
	u32 xec = (m->status >> 16) & 0xf;


	pr_emerg(HW_ERR "Load Store Error");
	pr_emerg(HW_ERR "Load Store Error");


@@ -260,9 +260,11 @@ static void amd_decode_ls_mce(u64 mc3_status)
	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
}
}


void amd_decode_nb_mce(int node_id, struct err_regs *regs)
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
{
	u32 ec  = ERROR_CODE(regs->nbsl);
	u32 ec   = m->status & 0xffff;
	u32 nbsh = (u32)(m->status >> 32);
	u32 nbsl = (u32)m->status;


	/*
	/*
	 * GART TLB error reporting is disabled by default. Bail out early.
	 * GART TLB error reporting is disabled by default. Bail out early.
@@ -278,10 +280,10 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs)
	 */
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
	    (boot_cpu_data.x86_model > 7)) {
		if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
			pr_cont(", core: %u\n", (u8)(nbsh & 0xf));
	} else {
	} else {
		u8 assoc_cpus = regs->nbsh & 0xf;
		u8 assoc_cpus = nbsh & 0xf;


		if (assoc_cpus > 0)
		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
@@ -289,17 +291,17 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs)
		pr_cont("\n");
		pr_cont("\n");
	}
	}


	pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(regs->nbsl));
	pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl));


	if (BUS_ERROR(ec) && nb_bus_decoder)
	if (BUS_ERROR(ec) && nb_bus_decoder)
		nb_bus_decoder(node_id, regs);
		nb_bus_decoder(node_id, m, nbcfg);
}
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);


static void amd_decode_fr_mce(u64 mc5_status)
static void amd_decode_fr_mce(struct mce *m)
{
{
	/* we have only one error signature so match all fields at once. */
	/* we have only one error signature so match all fields at once. */
	if ((mc5_status & 0xffff) == 0x0f0f)
	if ((m->status & 0xffff) == 0x0f0f)
		pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n");
		pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n");
	else
	else
		pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
		pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
@@ -326,7 +328,6 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
			   void *data)
			   void *data)
{
{
	struct mce *m = (struct mce *)data;
	struct mce *m = (struct mce *)data;
	struct err_regs regs;
	int node, ecc;
	int node, ecc;


	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
@@ -346,33 +347,28 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val,


	switch (m->bank) {
	switch (m->bank) {
	case 0:
	case 0:
		amd_decode_dc_mce(m->status);
		amd_decode_dc_mce(m);
		break;
		break;


	case 1:
	case 1:
		amd_decode_ic_mce(m->status);
		amd_decode_ic_mce(m);
		break;
		break;


	case 2:
	case 2:
		amd_decode_bu_mce(m->status);
		amd_decode_bu_mce(m);
		break;
		break;


	case 3:
	case 3:
		amd_decode_ls_mce(m->status);
		amd_decode_ls_mce(m);
		break;
		break;


	case 4:
	case 4:
		regs.nbsl  = (u32) m->status;
		regs.nbsh  = (u32)(m->status >> 32);
		regs.nbeal = (u32) m->addr;
		regs.nbeah = (u32)(m->addr >> 32);
		node = amd_get_nb_id(m->extcpu);
		node = amd_get_nb_id(m->extcpu);

		amd_decode_nb_mce(node, m, 0);
		amd_decode_nb_mce(node, &regs);
		break;
		break;


	case 5:
	case 5:
		amd_decode_fr_mce(m->status);
		amd_decode_fr_mce(m);
		break;
		break;


	default:
	default:
+3 −3
Original line number Original line Diff line number Diff line
@@ -63,8 +63,8 @@ struct err_regs {




void amd_report_gart_errors(bool);
void amd_report_gart_errors(bool);
void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
void amd_decode_nb_mce(int, struct err_regs *);
void amd_decode_nb_mce(int, struct mce *, u32);


#endif /* _EDAC_MCE_AMD_H */
#endif /* _EDAC_MCE_AMD_H */