Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 011d8261 authored by Borislav Petkov's avatar Borislav Petkov Committed by Ingo Molnar
Browse files

RAS: Add a Corrected Errors Collector



Introduce a simple data structure for collecting correctable errors
along with accessors. More detailed description in the code itself.

The error decoding is done with the decoding chain now and
mce_first_notifier() gets to see the error first and the CEC decides
whether to log it and then the rest of the chain doesn't hear about it -
basically the main reason for the CE collector - or to continue running
the notifiers.

When the CEC hits the action threshold, it will try to soft-offine the
page containing the ECC and then the whole decoding chain gets to see
the error.

Signed-off-by: default avatarBorislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170327093304.10683-5-bp@alien8.de


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent e64edfcc
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -3172,6 +3172,12 @@
	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
			See Documentation/blockdev/ramdisk.txt.

	ras=option[,option,...]	[KNL] RAS-specific options

		cec_disable	[X86]
				Disable the Correctable Errors Collector,
				see CONFIG_RAS_CEC help text.

	rcu_nocbs=	[KNL]
			The argument is a cpu list, as described above.

+5 −4
Original line number Diff line number Diff line
@@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;

enum mce_notifier_prios {
	MCE_PRIO_SRAO		= INT_MAX,
	MCE_PRIO_EXTLOG		= INT_MAX - 1,
	MCE_PRIO_NFIT		= INT_MAX - 2,
	MCE_PRIO_EDAC		= INT_MAX - 3,
	MCE_PRIO_FIRST		= INT_MAX,
	MCE_PRIO_SRAO		= INT_MAX - 1,
	MCE_PRIO_EXTLOG		= INT_MAX - 2,
	MCE_PRIO_NFIT		= INT_MAX - 3,
	MCE_PRIO_EDAC		= INT_MAX - 4,
	MCE_PRIO_LOWEST		= 0,
};

+115 −76
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {

void mce_log(struct mce *m)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(m);

	if (!mce_gen_pool_add(m))
		irq_work_queue(&mce_irq_work);

	wmb();
	for (;;) {
		entry = mce_log_get_idx_check(mcelog_buf.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog_buf.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog_buf.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
	wmb();
	mcelog_buf.entry[entry].finished = 1;
	wmb();

	set_bit(0, &mce_need_notify);
}

void mce_inject_log(struct mce *m)
@@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

/*
 * We run the default notifier if we have only the SRAO, the first and the
 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
 * notifiers registered on the chain.
 */
#define NUM_DEFAULT_NOTIFIERS	3
static atomic_t num_notifiers;

void mce_register_decode_chain(struct notifier_block *nb)
@@ -522,7 +490,6 @@ static void mce_schedule_work(void)

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

@@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
	return 1;
}

static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;

		return (xec == 0x0 || xec == 0x8);
	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just gives more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;
	}

	return false;
}

static bool cec_add_mce(struct mce *m)
{
	if (!m)
		return false;

	/* We eat only correctable DRAM errors with usable addresses. */
	if (memory_error(m) &&
	    !(m->status & MCI_STATUS_UC) &&
	    mce_usable_address(m))
		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
			return true;

	return false;
}

static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int next, entry;

	if (!m)
		return NOTIFY_DONE;

	if (cec_add_mce(m))
		return NOTIFY_STOP;

	/* Emit the trace record: */
	trace_mce_record(m);

	wmb();
	for (;;) {
		entry = mce_log_get_idx_check(mcelog_buf.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog_buf.flags);
				return NOTIFY_DONE;
			}
			/* Old left over entry. Skip: */
			if (mcelog_buf.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
	wmb();
	mcelog_buf.entry[entry].finished = 1;
	wmb();

	set_bit(0, &mce_need_notify);

	mce_notify_irq();

	return NOTIFY_DONE;
}

static struct notifier_block first_nb = {
	.notifier_call	= mce_first_notifier,
	.priority	= MCE_PRIO_FIRST,
};

static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
@@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
	if (!m)
		return NOTIFY_DONE;

	/*
	 * Run the default notifier if we have only the SRAO
	 * notifier and us registered.
	 */
	if (atomic_read(&num_notifiers) > 2)
	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
		return NOTIFY_DONE;

	/* Don't print when mcelog is running */
@@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
	}
}

static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;

		return (xec == 0x0 || xec == 0x8);
	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just gives more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;
	}

	return false;
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
@@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
	mcheck_intel_therm_init();
	mce_register_decode_chain(&first_nb);
	mce_register_decode_chain(&mce_srao_nb);
	mce_register_decode_chain(&mce_default_nb);
	mcheck_vendor_init_severity();
@@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
		static_branch_inc(&mcsafe_key);

	mcheck_debugfs_init();
	cec_init();

	/*
	 * Flush out everything that has been logged during early boot, now that
+14 −0
Original line number Diff line number Diff line
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
	  aspects of the MCE handling code.

	  WARNING: Do not even assume this interface is staying stable!

config RAS_CEC
	bool "Correctable Errors Collector"
	depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
	---help---
	  This is a small cache which collects correctable memory errors per 4K
	  page PFN and counts their repeated occurrence. Once the counter for a
	  PFN overflows, we try to soft-offline that page as we take it to mean
	  that it has reached a relatively high error count and would probably
	  be best if we don't use it anymore.

	  Bear in mind that this is absolutely useless if your platform doesn't
	  have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+2 −1
Original line number Diff line number Diff line
obj-$(CONFIG_RAS)	+= ras.o debugfs.o
obj-$(CONFIG_RAS_CEC)	+= cec.o
Loading