Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a1c75e17 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Ingo Molnar:

 - various AMD SMCA error parsing/reporting improvements (Yazen Ghannam)

 - extend Intel CMCI error reporting to more cases (Xie XiuQi)

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/MCE: Make correctable error detection look at the Deferred bit
  x86/MCE: Report only DRAM ECC as memory errors on AMD systems
  x86/MCE/AMD: Define a function to get SMCA bank type
  x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems
  x86/MCE: Extend table to report action optional errors through CMCI too
parents d8b91dde 179eb850
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -376,6 +376,7 @@ struct smca_bank {
extern struct smca_bank smca_banks[MAX_NR_BANKS];

extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);

extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
@@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);

static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };

#endif

+17 −9
Original line number Diff line number Diff line
@@ -59,6 +59,7 @@ static struct severity {
#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
#define  MASK(x, y)	.mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)

@@ -101,6 +102,22 @@ static struct severity {
		NOSER, BITCLR(MCI_STATUS_UC)
		),

	/*
	 * known AO MCACODs reported via MCE or CMC:
	 *
	 * SRAO could be signaled either via a machine check exception or
	 * CMCI with the corresponding bit S 1 or 0. So we don't need to
	 * check bit S for SRAO.
	 */
	MCESEV(
		AO, "Action optional: memory scrubbing error",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
		),
	MCESEV(
		AO, "Action optional: last level cache writeback error",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
		),

	/* ignore OVER for UCNA */
	MCESEV(
		UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
		),

	/* known AO MCACODs: */
	MCESEV(
		AO, "Action optional: memory scrubbing error",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
		),
	MCESEV(
		AO, "Action optional: last level cache writeback error",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
		),
	MCESEV(
		SOME, "Action optional: unknown MCACOD",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+13 −4
Original line number Diff line number Diff line
@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
bool mce_is_memory_error(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;
		return amd_mce_is_memory_error(m);

		return (xec == 0x0 || xec == 0x8);
	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

static bool mce_is_correctable(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->status & MCI_STATUS_UC)
		return false;

	return true;
}

static bool cec_add_mce(struct mce *m)
{
	if (!m)
@@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)

	/* We eat only correctable DRAM errors with usable addresses. */
	if (mce_is_memory_error(m) &&
	    !(m->status & MCI_STATUS_UC) &&
	    mce_is_correctable(m)  &&
	    mce_usable_address(m))
		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
			return true;
+28 −1
Original line number Diff line number Diff line
@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);

static enum smca_bank_types smca_get_bank_type(struct mce *m)
{
	struct smca_bank *b;

	if (m->bank >= N_SMCA_BANK_TYPES)
		return N_SMCA_BANK_TYPES;

	b = &smca_banks[m->bank];
	if (!b->hwid)
		return N_SMCA_BANK_TYPES;

	return b->hwid->bank_type;
}

static struct smca_hwid smca_hwid_mcatypes[] = {
	/* { bank_type, hwid_mcatype, xec_bitmap } */

@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
	    (deferred_error_int_vector != amd_deferred_error_interrupt))
		deferred_error_int_vector = amd_deferred_error_interrupt;

	if (!mce_flags.smca)
		low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;

	wrmsr(MSR_CU_DEF_ERR, low, high);
}

@@ -738,6 +754,17 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);

bool amd_mce_is_memory_error(struct mce *m)
{
	/* ErrCodeExt[20:16] */
	u8 xec = (m->status >> 16) & 0x1f;

	if (mce_flags.smca)
		return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;

	return m->bank == 4 && xec == 0x8;
}

static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
	struct mce m;