RAS: Add a Corrected Errors Collector (011d8261) · Commits · e / devices / android_kernel_fairphone_FP4

Documentation/admin-guide/kernel-parameters.txt

+6 −0

Original line number	Diff line number	Diff line
		@@ -3172,6 +3172,12 @@
		ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
		See Documentation/blockdev/ramdisk.txt.

		ras=option[,option,...] [KNL] RAS-specific options

		cec_disable [X86]
		Disable the Correctable Errors Collector,
		see CONFIG_RAS_CEC help text.

		rcu_nocbs= [KNL]
		The argument is a cpu list, as described above.

arch/x86/include/asm/mce.h

+5 −4

Original line number	Diff line number	Diff line
		@@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
		extern struct mca_msr_regs msr_ops;

		enum mce_notifier_prios {
		MCE_PRIO_SRAO = INT_MAX,
		MCE_PRIO_EXTLOG = INT_MAX - 1,
		MCE_PRIO_NFIT = INT_MAX - 2,
		MCE_PRIO_EDAC = INT_MAX - 3,
		MCE_PRIO_FIRST = INT_MAX,
		MCE_PRIO_SRAO = INT_MAX - 1,
		MCE_PRIO_EXTLOG = INT_MAX - 2,
		MCE_PRIO_NFIT = INT_MAX - 3,
		MCE_PRIO_EDAC = INT_MAX - 4,
		MCE_PRIO_LOWEST = 0,
		};

arch/x86/kernel/cpu/mcheck/mce.c

+115 −76

Original line number	Diff line number	Diff line
		@@ -35,6 +35,7 @@
		#include <linux/poll.h>
		#include <linux/nmi.h>
		#include <linux/cpu.h>
		#include <linux/ras.h>
		#include <linux/smp.h>
		#include <linux/fs.h>
		#include <linux/mm.h>
		@@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {

		void mce_log(struct mce *m)
		{
		unsigned next, entry;

		/* Emit the trace record: */
		trace_mce_record(m);

		if (!mce_gen_pool_add(m))
		irq_work_queue(&mce_irq_work);

		wmb();
		for (;;) {
		entry = mce_log_get_idx_check(mcelog_buf.next);
		for (;;) {

		/*
		* When the buffer fills up discard new entries.
		* Assume that the earlier errors are the more
		* interesting ones:
		*/
		if (entry >= MCE_LOG_LEN) {
		set_bit(MCE_OVERFLOW,
		(unsigned long *)&mcelog_buf.flags);
		return;
		}
		/* Old left over entry. Skip: */
		if (mcelog_buf.entry[entry].finished) {
		entry++;
		continue;
		}
		break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
		break;
		}
		memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
		wmb();
		mcelog_buf.entry[entry].finished = 1;
		wmb();

		set_bit(0, &mce_need_notify);
		}

		void mce_inject_log(struct mce *m)
		@@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);

		static struct notifier_block mce_srao_nb;

		/*
		* We run the default notifier if we have only the SRAO, the first and the
		* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
		* notifiers registered on the chain.
		*/
		#define NUM_DEFAULT_NOTIFIERS 3
		static atomic_t num_notifiers;

		void mce_register_decode_chain(struct notifier_block *nb)
		@@ -522,7 +490,6 @@ static void mce_schedule_work(void)

		static void mce_irq_work_cb(struct irq_work *entry)
		{
		mce_notify_irq();
		mce_schedule_work();
		}

		@@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
		return 1;
		}

		static bool memory_error(struct mce *m)
		{
		struct cpuinfo_x86 *c = &boot_cpu_data;

		if (c->x86_vendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;

		return (xec == 0x0 \|\| xec == 0x8);
		} else if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		*
		* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		* indicating a memory error. Bit 8 is used for indicating a
		* cache hierarchy error. The combination of bit 2 and bit 3
		* is used for indicating a `generic' cache hierarchy error
		* But we can't just blindly check the above bits, because if
		* bit 11 is set, then it is a bus/interconnect error - and
		* either way the above bits just gives more detail on what
		* bus/interconnect error happened. Note that bit 12 can be
		* ignored, as it's the "filter" bit.
		*/
		return (m->status & 0xef80) == BIT(7) \|\|
		(m->status & 0xef00) == BIT(8) \|\|
		(m->status & 0xeffc) == 0xc;
		}

		return false;
		}

		static bool cec_add_mce(struct mce *m)
		{
		if (!m)
		return false;

		/* We eat only correctable DRAM errors with usable addresses. */
		if (memory_error(m) &&
		!(m->status & MCI_STATUS_UC) &&
		mce_usable_address(m))
		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
		return true;

		return false;
		}

		static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
		void *data)
		{
		struct mce m = (struct mce )data;
		unsigned int next, entry;

		if (!m)
		return NOTIFY_DONE;

		if (cec_add_mce(m))
		return NOTIFY_STOP;

		/* Emit the trace record: */
		trace_mce_record(m);

		wmb();
		for (;;) {
		entry = mce_log_get_idx_check(mcelog_buf.next);
		for (;;) {

		/*
		* When the buffer fills up discard new entries.
		* Assume that the earlier errors are the more
		* interesting ones:
		*/
		if (entry >= MCE_LOG_LEN) {
		set_bit(MCE_OVERFLOW,
		(unsigned long *)&mcelog_buf.flags);
		return NOTIFY_DONE;
		}
		/* Old left over entry. Skip: */
		if (mcelog_buf.entry[entry].finished) {
		entry++;
		continue;
		}
		break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
		break;
		}
		memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
		wmb();
		mcelog_buf.entry[entry].finished = 1;
		wmb();

		set_bit(0, &mce_need_notify);

		mce_notify_irq();

		return NOTIFY_DONE;
		}

		static struct notifier_block first_nb = {
		.notifier_call = mce_first_notifier,
		.priority = MCE_PRIO_FIRST,
		};

		static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
		void *data)
		{
		@@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
		if (!m)
		return NOTIFY_DONE;

		/*
		* Run the default notifier if we have only the SRAO
		* notifier and us registered.
		*/
		if (atomic_read(&num_notifiers) > 2)
		if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
		return NOTIFY_DONE;

		/* Don't print when mcelog is running */
		@@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
		}
		}

		static bool memory_error(struct mce *m)
		{
		struct cpuinfo_x86 *c = &boot_cpu_data;

		if (c->x86_vendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;

		return (xec == 0x0 \|\| xec == 0x8);
		} else if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		*
		* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		* indicating a memory error. Bit 8 is used for indicating a
		* cache hierarchy error. The combination of bit 2 and bit 3
		* is used for indicating a `generic' cache hierarchy error
		* But we can't just blindly check the above bits, because if
		* bit 11 is set, then it is a bus/interconnect error - and
		* either way the above bits just gives more detail on what
		* bus/interconnect error happened. Note that bit 12 can be
		* ignored, as it's the "filter" bit.
		*/
		return (m->status & 0xef80) == BIT(7) \|\|
		(m->status & 0xef00) == BIT(8) \|\|
		(m->status & 0xeffc) == 0xc;
		}

		return false;
		}

		DEFINE_PER_CPU(unsigned, mce_poll_count);

		/*
		@@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
		int __init mcheck_init(void)
		{
		mcheck_intel_therm_init();
		mce_register_decode_chain(&first_nb);
		mce_register_decode_chain(&mce_srao_nb);
		mce_register_decode_chain(&mce_default_nb);
		mcheck_vendor_init_severity();
		@@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
		static_branch_inc(&mcsafe_key);

		mcheck_debugfs_init();
		cec_init();

		/*
		* Flush out everything that has been logged during early boot, now that

arch/x86/ras/Kconfig

+14 −0

Original line number	Diff line number	Diff line
		@@ -7,3 +7,17 @@ config MCE_AMD_INJ
		aspects of the MCE handling code.

		WARNING: Do not even assume this interface is staying stable!

		config RAS_CEC
		bool "Correctable Errors Collector"
		depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
		---help---
		This is a small cache which collects correctable memory errors per 4K
		page PFN and counts their repeated occurrence. Once the counter for a
		PFN overflows, we try to soft-offline that page as we take it to mean
		that it has reached a relatively high error count and would probably
		be best if we don't use it anymore.

		Bear in mind that this is absolutely useless if your platform doesn't
		have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.

drivers/ras/Makefile

+2 −1

Original line number	Diff line number	Diff line
		obj-$(CONFIG_RAS) += ras.o debugfs.o
		obj-$(CONFIG_RAS_CEC) += cec.o