Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f5c8a104 authored by Ingo Molnar
Browse files

Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras



Pull RAS update from Borislav Petkov:

  "This has been long in the making - an AMD-specific MCE-severity grading
   function. And it is actually readable at a quick glance. Further error
   recovery actions will be based on its output.

   Patches tested on every relevant AMD family out there."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
parents c9ce8712 43eaa2a1
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -116,6 +116,12 @@ struct mca_config {
	u32 rip_msr;
};

/* Vendor-specific MCE feature bits, packed into a single 64-bit word. */
struct mce_vendor_flags {
	/* MCA overflow recovery flag, sourced from cpuid_ebx(80000007). */
	__u64 overflow_recov : 1;

	/* The remaining 63 bits are not assigned yet. */
	__u64 __reserved_0 : 63;
};
/* Vendor feature flags instance shared by the MCE core and severity code. */
extern struct mce_vendor_flags mce_flags;

/* MCA configuration instance (struct mca_config is declared above). */
extern struct mca_config mca_cfg;
/*
 * Add/remove a notifier block on the MCE decode chain.
 * NOTE(review): inferred from the names; implementations are not visible here.
 */
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
/* MCE initialisation entry points, available when CONFIG_X86_MCE is set. */
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
/* Selects the vendor-specific mce_severity() grader (AMD vs. the default). */
void mcheck_vendor_init_severity(void);
#else
/* Without CONFIG_X86_MCE these collapse to no-ops; mcheck_init() returns 0. */
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif

#ifdef CONFIG_X86_ANCIENT_MCE
+1 −1
Original line number Diff line number Diff line
@@ -24,7 +24,7 @@ struct mce_bank {
	char			attrname[ATTR_LEN];	/* attribute name */
};

int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
/*
 * Severity grader, called through a pointer so that the implementation can be
 * chosen per CPU vendor at init time.
 */
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);

extern struct mce_bank *mce_banks;
+66 −1
Original line number Diff line number Diff line
@@ -186,7 +186,62 @@ static int error_context(struct mce *m)
	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}

int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
/*
 * Grade the severity of a machine check on AMD processors.
 *
 * See AMD Error Scope Hierarchy table in a newer BKDG. For example
 * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
 *
 * @m:        the MCE record being graded; ->status, ->mcgstatus and (through
 *            error_context()) ->cs are consulted.
 * @tolerant: unused here; kept so the signature matches the mce_severity
 *            function pointer.
 * @msg:      unused here; never written by this grader.
 * @is_excp:  unused here.
 *
 * Returns one of MCE_PANIC_SEVERITY, MCE_AR_SEVERITY, MCE_UC_SEVERITY,
 * MCE_DEFERRED_SEVERITY or MCE_KEEP_SEVERITY.
 */
static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
{
	enum context ctx = error_context(m);

	/* Processor Context Corrupt, no need to fumble too much, die! */
	if (m->status & MCI_STATUS_PCC)
		return MCE_PANIC_SEVERITY;

	if (m->status & MCI_STATUS_UC) {

		/*
		 * On older systems where overflow_recov flag is not present, we
		 * should simply panic if an error overflow occurs. If
		 * overflow_recov flag is present and set, then software can try
		 * to at least kill process to prolong system operation.
		 */
		if (mce_flags.overflow_recov) {
			/*
			 * Software can try to contain, except when RIPV is
			 * clear and the error was taken in kernel context.
			 */
			if (!(m->mcgstatus & MCG_STATUS_RIPV) && ctx == IN_KERNEL)
				return MCE_PANIC_SEVERITY;

			/*
			 * Kill current process. This return is unconditional
			 * for every remaining overflow_recov case.
			 */
			return MCE_AR_SEVERITY;
		}

		/* No overflow recovery: at least one error was not logged. */
		if (m->status & MCI_STATUS_OVER)
			return MCE_PANIC_SEVERITY;

		/*
		 * For any other case, return MCE_UC_SEVERITY so that we log the
		 * error and exit #MC handler.
		 */
		return MCE_UC_SEVERITY;
	}

	/*
	 * deferred error: poll handler catches these and adds to mce_ring so
	 * memory-failure can take recovery actions.
	 */
	if (m->status & MCI_STATUS_DEFERRED)
		return MCE_DEFERRED_SEVERITY;

	/*
	 * corrected error: poll handler catches these and passes responsibility
	 * of decoding the error to EDAC
	 */
	return MCE_KEEP_SEVERITY;
}

static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
	enum context ctx = error_context(m);
@@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
	}
}

/*
 * Active severity grader. Defaults to mce_severity_intel;
 * mcheck_vendor_init_severity() repoints it to mce_severity_amd on AMD CPUs.
 */
int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
		    mce_severity_intel;

/*
 * Install the vendor-specific severity grader. Only AMD gets a dedicated
 * grader; every other vendor keeps whatever mce_severity already points at.
 */
void __init mcheck_vendor_init_severity(void)
{
	/* Nothing to override unless the boot CPU is an AMD part. */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
		return;

	mce_severity = mce_severity_amd;
}

#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
+10 −0
Original line number Diff line number Diff line
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;
/* Vendor-specific MCE feature flags; overflow_recov is filled in for AMD CPUs. */
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
@@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
		if (c->x86 == 6 && cfg->banks > 0)
			mce_banks[0].ctl = 0;

		/*
		 * overflow_recov is supported for F15h Models 00h-0fh
		 * even though we don't have a CPUID bit for it.
		 */
		if (c->x86 == 0x15 && c->x86_model <= 0xf)
			mce_flags.overflow_recov = 1;

		/*
		 * Turn off MC4_MISC thresholding banks on those models since
		 * they're not supported there.
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
		break;
	default:
		break;
@@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
/*
 * Init-time MCE setup: calls mcheck_intel_therm_init() and then selects the
 * vendor-specific severity grader. Always returns 0.
 */
int __init mcheck_init(void)
{
	mcheck_intel_therm_init();
	/* Must have run before mce_severity() is expected to be vendor-aware. */
	mcheck_vendor_init_severity();

	return 0;
}