Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b2f9d678 authored by Tony Luck's avatar Tony Luck Committed by Ingo Molnar
Browse files

x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries



Extend the severity checking code to add a new context IN_KERN_RECOV
which is used to indicate that the machine check was triggered by code
in the kernel tagged with _ASM_EXTABLE_FAULT() so that the ex_handler_fault()
handler will provide the fixup code with the trap number.

Major re-work to the tail code in do_machine_check() to make all this
readable/maintainable. One functional change is that tolerant=3 no longer
stops recovery actions. Revert to only skipping sending SIGBUS to the
current process.

Signed-off-by: default avatarTony Luck <tony.luck@intel.com>
Reviewed-by: default avatarBorislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/89d243d05a7943bb187d1074bb30d9c4f482d5f5.1455732970.git.tony.luck@intel.com


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 548acf19
Loading
Loading
Loading
Loading
+20 −2
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/debugfs.h>
#include <asm/mce.h>
#include <asm/uaccess.h>

#include "mce-internal.h"

@@ -29,7 +30,7 @@
 * panic situations)
 */

enum context { IN_KERNEL = 1, IN_USER = 2 };
enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };

@@ -48,6 +49,7 @@ static struct severity {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define  KERNEL		.context = IN_KERNEL
#define  USER		.context = IN_USER
#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
#define  SER		.ser = SER_REQUIRED
#define  NOSER		.ser = NO_SER
#define  EXCP		.excp = EXCP_CONTEXT
@@ -86,6 +88,10 @@ static struct severity {
		PANIC, "In kernel and no restart IP",
		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
		),
	MCESEV(
		PANIC, "In kernel and no restart IP",
		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
		),
	MCESEV(
		DEFERRED, "Deferred error",
		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
@@ -122,6 +128,11 @@ static struct severity {
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
		),
	MCESEV(
		AR, "Action required: data load in error recoverable area of kernel",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
		KERNEL_RECOV
		),
	MCESEV(
		AR, "Action required: data load error in a user process",
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
@@ -170,6 +181,9 @@ static struct severity {
		)	/* always matches. keep at end */
};

#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))

/*
 * If mcgstatus indicated that ip/cs on the stack were
 * no good, then "m->cs" will be zero and we will have
@@ -183,7 +197,11 @@ static struct severity {
 */
static int error_context(struct mce *m)
{
	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
	if ((m->cs & 3) == 3)
		return IN_USER;
	if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
		return IN_KERNEL_RECOV;
	return IN_KERNEL;
}

/*
+36 −34
Original line number Diff line number Diff line
@@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
	}
}

static int do_memory_failure(struct mce *m)
{
	int flags = MF_ACTION_REQUIRED;
	int ret;

	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
	if (!(m->mcgstatus & MCG_STATUS_RIPV))
		flags |= MF_MUST_KILL;
	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
	if (ret)
		pr_err("Memory error not recovered");
	return ret;
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";
	u64 recover_paddr = ~0ull;
	int flags = MF_ACTION_REQUIRED;
	int lmce = 0;

	/* If this CPU is offline, just bail out. */
@@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
	}

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 * If tolerant is at an insane level we drop requests to kill
	 * processes and continue even when there is no way out.
	 */
	if (cfg->tolerant < 3) {
		if (no_way_out)
	if (cfg->tolerant == 3)
		kill_it = 0;
	else if (no_way_out)
		mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			recover_paddr = m.addr;
			if (!(m.mcgstatus & MCG_STATUS_RIPV))
				flags |= MF_MUST_KILL;
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
@@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
out:
	sync_core();

	if (recover_paddr == ~0ull)
		goto done;
	if (worst != MCE_AR_SEVERITY && !kill_it)
		goto out_ist;

	pr_err("Uncorrected hardware memory error in user-access at %llx",
		 recover_paddr);
	/*
	 * We must call memory_failure() here even if the current process is
	 * doomed. We still need to mark the page as poisoned and alert any
	 * other users of the page.
	 */
	/* Fault was in user mode and we need to take some action */
	if ((m.cs & 3) == 3) {
		ist_begin_non_atomic(regs);
		local_irq_enable();
	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
		pr_err("Memory error not recovered");

		if (kill_it || do_memory_failure(&m))
			force_sig(SIGBUS, current);
	}
		local_irq_disable();
		ist_end_non_atomic();
done:
	} else {
		if (!fixup_exception(regs, X86_TRAP_MC))
			mce_panic("Failed kernel mode recovery", &m, NULL);
	}

out_ist:
	ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);