Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 37a16046 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Thomas Gleixner:
 "A small set of changes to the RAS core:

   - Rework of the MCE bank scanning code

   - Y2038 converion"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Cleanup __mc_scan_banks()
  x86/mce: Carve out bank scanning code
  x86/mce: Remove !banks check
  x86/mce: Carve out the crashing_cpu check
  x86/mce: Always use 64-bit timestamps
parents b99cdfdf d5c84ef2
Loading
Loading
Loading
Loading
+107 −95
Original line number Diff line number Diff line
@@ -123,8 +123,8 @@ void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	/* need the internal __ version to avoid deadlocks */
	m->time = __ktime_get_real_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
@@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn)
}
#endif


/*
 * Cases where we avoid rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *  skip those CPUs which remain looping in the 1st kernel - see
 *  crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
static bool __mc_check_crashing_cpu(int cpu)
{
	if (cpu_is_offline(cpu) ||
	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
		if (mcgstatus & MCG_STATUS_RIPV) {
			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
			return true;
		}
	}
	return false;
}

static void __mc_scan_banks(struct mce *m, struct mce *final,
			    unsigned long *toclear, unsigned long *valid_banks,
			    int no_way_out, int *worst)
{
	struct mca_config *cfg = &mca_cfg;
	int severity, i;

	for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;

		if (!mce_banks[i].ctl)
			continue;

		m->misc = 0;
		m->addr = 0;
		m->bank = i;

		m->status = mce_rdmsrl(msr_ops.status(i));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		/*
		 * Corrected or non-signaled errors are handled by
		 * machine_check_poll(). Leave them alone, unless this panics.
		 */
		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/* Set taint even when machine check was not enabled. */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(m, cfg->tolerant, NULL, true);

		/*
		 * When machine check was for corrected/deferred handler don't
		 * touch, unless we're panicking.
		 */
		if ((severity == MCE_KEEP_SEVERITY ||
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;

		__set_bit(i, toclear);

		/* Machine check event was not enabled. Clear, but ignore. */
		if (severity == MCE_NO_SEVERITY)
			continue;

		mce_read_aux(m, i);

		/* assuming valid severity level != 0 */
		m->severity = severity;

		mce_log(m);

		if (severity > *worst) {
			*final = *m;
			*worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	*m = *final;
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
@@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn)
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	struct mca_config *cfg = &mca_cfg;
	int cpu = smp_processor_id();
	char *msg = "Unknown";
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;

	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order = -1;

	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;

	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	/*
	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
	 * on Intel.
	 */
	int lmce = 1;
	int cpu = smp_processor_id();

	/*
	 * Cases where we avoid rendezvous handler timeout:
	 * 1) If this CPU is offline.
	 *
	 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
	 *  skip those CPUs which remain looping in the 1st kernel - see
	 *  crash_nmi_callback().
	 *
	 * Note: there still is a small window between kexec-ing and the new,
	 * kdump kernel establishing a new #MC handler where a broadcasted MCE
	 * might not get handled properly.
	 */
	if (cpu_is_offline(cpu) ||
	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
		if (mcgstatus & MCG_STATUS_RIPV) {
			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
	if (__mc_check_crashing_cpu(cpu))
		return;
		}
	}

	ist_enter(regs);

	this_cpu_inc(mce_exception_count);

	if (!cfg->banks)
		goto out;

	mce_gather_info(&m, regs);
	m.tsc = rdtsc();

@@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
		order = mce_start(&no_way_out);
	}

	for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(msr_ops.status(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(&m, cfg->tolerant, NULL, true);

		/*
		 * When machine check was for corrected/deferred handler don't
		 * touch, unless we're panicing.
		 */
		if ((severity == MCE_KEEP_SEVERITY ||
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/* assuming valid severity level != 0 */
		m.severity = severity;

		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;
	__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);

	if (!no_way_out)
		mce_clear_state(toclear);
@@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:

	sync_core();

	if (worst != MCE_AR_SEVERITY && !kill_it)