Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (37a16046) · Commits · e / devices / android_kernel_teracube_emerald

arch/x86/kernel/cpu/mcheck/mce.c

+107 −95

Original line number	Diff line number	Diff line
		@@ -123,8 +123,8 @@ void mce_setup(struct mce *m)
		{
		memset(m, 0, sizeof(struct mce));
		m->cpu = m->extcpu = smp_processor_id();
		/* We hope get_seconds stays lockless */
		m->time = get_seconds();
		/* need the internal __ version to avoid deadlocks */
		m->time = __ktime_get_real_seconds();
		m->cpuvendor = boot_cpu_data.x86_vendor;
		m->cpuid = cpuid_eax(1);
		m->socketid = cpu_data(m->extcpu).phys_proc_id;
		@@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn)
		}
		#endif


		/*
		* Cases where we avoid rendezvous handler timeout:
		* 1) If this CPU is offline.
		*
		* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
		* skip those CPUs which remain looping in the 1st kernel - see
		* crash_nmi_callback().
		*
		* Note: there still is a small window between kexec-ing and the new,
		* kdump kernel establishing a new #MC handler where a broadcasted MCE
		* might not get handled properly.
		*/
		static bool __mc_check_crashing_cpu(int cpu)
		{
		if (cpu_is_offline(cpu) \|\|
		(crashing_cpu != -1 && crashing_cpu != cpu)) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
		if (mcgstatus & MCG_STATUS_RIPV) {
		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
		return true;
		}
		}
		return false;
		}

		static void __mc_scan_banks(struct mce m, struct mce final,
		unsigned long toclear, unsigned long valid_banks,
		int no_way_out, int *worst)
		{
		struct mca_config *cfg = &mca_cfg;
		int severity, i;

		for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
		continue;

		if (!mce_banks[i].ctl)
		continue;

		m->misc = 0;
		m->addr = 0;
		m->bank = i;

		m->status = mce_rdmsrl(msr_ops.status(i));
		if (!(m->status & MCI_STATUS_VAL))
		continue;

		/*
		* Corrected or non-signaled errors are handled by
		* machine_check_poll(). Leave them alone, unless this panics.
		*/
		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		!no_way_out)
		continue;

		/* Set taint even when machine check was not enabled. */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(m, cfg->tolerant, NULL, true);

		/*
		* When machine check was for corrected/deferred handler don't
		* touch, unless we're panicking.
		*/
		if ((severity == MCE_KEEP_SEVERITY \|\|
		severity == MCE_UCNA_SEVERITY) && !no_way_out)
		continue;

		__set_bit(i, toclear);

		/* Machine check event was not enabled. Clear, but ignore. */
		if (severity == MCE_NO_SEVERITY)
		continue;

		mce_read_aux(m, i);

		/* assuming valid severity level != 0 */
		m->severity = severity;

		mce_log(m);

		if (severity > *worst) {
		final = m;
		*worst = severity;
		}
		}

		/* mce_clear_state will clear final, save locally for use later /
		m = final;
		}

		/*
		* The actual machine check handler. This only handles real
		* exceptions when something got corrupted coming in through int 18.
		@@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn)
		*/
		void do_machine_check(struct pt_regs *regs, long error_code)
		{
		DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
		DECLARE_BITMAP(toclear, MAX_NR_BANKS);
		struct mca_config *cfg = &mca_cfg;
		int cpu = smp_processor_id();
		char *msg = "Unknown";
		struct mce m, *final;
		int i;
		int worst = 0;
		int severity;

		/*
		* Establish sequential order between the CPUs entering the machine
		* check handler.
		*/
		int order = -1;

		/*
		* If no_way_out gets set, there is no safe way to recover from this
		* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
		*/
		int no_way_out = 0;

		/*
		* If kill_it gets set, there might be a way to recover from this
		* error.
		*/
		int kill_it = 0;
		DECLARE_BITMAP(toclear, MAX_NR_BANKS);
		DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
		char *msg = "Unknown";

		/*
		* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
		* on Intel.
		*/
		int lmce = 1;
		int cpu = smp_processor_id();

		/*
		* Cases where we avoid rendezvous handler timeout:
		* 1) If this CPU is offline.
		*
		* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
		* skip those CPUs which remain looping in the 1st kernel - see
		* crash_nmi_callback().
		*
		* Note: there still is a small window between kexec-ing and the new,
		* kdump kernel establishing a new #MC handler where a broadcasted MCE
		* might not get handled properly.
		*/
		if (cpu_is_offline(cpu) \|\|
		(crashing_cpu != -1 && crashing_cpu != cpu)) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
		if (mcgstatus & MCG_STATUS_RIPV) {
		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
		if (__mc_check_crashing_cpu(cpu))
		return;
		}
		}

		ist_enter(regs);

		this_cpu_inc(mce_exception_count);

		if (!cfg->banks)
		goto out;

		mce_gather_info(&m, regs);
		m.tsc = rdtsc();

		@@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
		order = mce_start(&no_way_out);
		}

		for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
		continue;
		if (!mce_banks[i].ctl)
		continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(msr_ops.status(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
		continue;

		/*
		* Non uncorrected or non signaled errors are handled by
		* machine_check_poll. Leave them alone, unless this panics.
		*/
		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		!no_way_out)
		continue;

		/*
		* Set taint even when machine check was not enabled.
		*/
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(&m, cfg->tolerant, NULL, true);

		/*
		* When machine check was for corrected/deferred handler don't
		* touch, unless we're panicing.
		*/
		if ((severity == MCE_KEEP_SEVERITY \|\|
		severity == MCE_UCNA_SEVERITY) && !no_way_out)
		continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
		/*
		* Machine check event was not enabled. Clear, but
		* ignore.
		*/
		continue;
		}

		mce_read_aux(&m, i);

		/* assuming valid severity level != 0 */
		m.severity = severity;

		mce_log(&m);

		if (severity > worst) {
		*final = m;
		worst = severity;
		}
		}

		/* mce_clear_state will clear final, save locally for use later /
		m = *final;
		__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);

		if (!no_way_out)
		mce_clear_state(toclear);
		@@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
		if (worst > 0)
		mce_report_event(regs);
		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
		out:

		sync_core();

		if (worst != MCE_AR_SEVERITY && !kill_it)