Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fab5669d authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS changes from Ingo Molnar:

 - SCI reporting for other error types, not only correctable ones

 - GHES cleanups

 - Add the functionality to override error reporting agents as some
   machines are sporting a new extended error logging capability which,
   if done properly in the BIOS, makes a corresponding EDAC module
   redundant

 - PCIe AER tracepoint severity levels fix

 - Error path correction for the mce device init

 - MCE timer fix

 - Add more flexibility to the error injection (EINJ) debugfs interface

* 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86, mce: Fix mce_start_timer semantics
  ACPI, APEI, GHES: Cleanup ghes memory error handling
  ACPI, APEI: Cleanup alignment-aware accesses
  ACPI, APEI, GHES: Do not report only correctable errors with SCI
  ACPI, APEI, EINJ: Changes to the ACPI/APEI/EINJ debugfs interface
  ACPI, eMCA: Combine eMCA/EDAC event reporting priority
  EDAC, sb_edac: Modify H/W event reporting policy
  EDAC: Add an edac_report parameter to EDAC
  PCI, AER: Fix severity usage in aer trace event
  x86, mce: Call put_device on device_register failure
parents 74e8ee82 b769e014
Loading
Loading
Loading
Loading
+18 −1
Original line number Original line Diff line number Diff line
@@ -45,11 +45,22 @@ directory apei/einj. The following files are provided.
  injection. Before this, please specify all necessary error
  injection. Before this, please specify all necessary error
  parameters.
  parameters.


- flags
  Present for kernel version 3.13 and above. Used to specify which
  of param{1..4} are valid and should be used by BIOS during injection.
  Value is a bitmask as specified in ACPI5.0 spec for the
  SET_ERROR_TYPE_WITH_ADDRESS data structure:
	Bit 0 - Processor APIC field valid (see param3 below)
	Bit 1 - Memory address and mask valid (param1 and param2)
	Bit 2 - PCIe (seg,bus,dev,fn) valid (param4 below)
  If set to zero, legacy behaviour is used where the type of injection
  specifies just one bit set, and param1 is multiplexed.

- param1
- param1
  This file is used to set the first error parameter value. Effect of
  This file is used to set the first error parameter value. Effect of
  parameter depends on error_type specified. For example, if error
  parameter depends on error_type specified. For example, if error
  type is memory related type, the param1 should be a valid physical
  type is memory related type, the param1 should be a valid physical
  memory address.
  memory address. [Unless "flags" is set - see above]


- param2
- param2
  This file is used to set the second error parameter value. Effect of
  This file is used to set the second error parameter value. Effect of
@@ -58,6 +69,12 @@ directory apei/einj. The following files are provided.
  address mask. Linux requires page or narrower granularity, say,
  address mask. Linux requires page or narrower granularity, say,
  0xfffffffffffff000.
  0xfffffffffffff000.


- param3
  Used when the 0x1 bit is set in "flags" to specify the APIC id

- param4
  Used when the 0x4 bit is set in "flags" to specify target PCIe device

- notrigger
- notrigger
  The EINJ mechanism is a two step process. First inject the error, then
  The EINJ mechanism is a two step process. First inject the error, then
  perform some actions to trigger it. Setting "notrigger" to 1 skips the
  perform some actions to trigger it. Setting "notrigger" to 1 skips the
+8 −0
Original line number Original line Diff line number Diff line
@@ -890,6 +890,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.


			The xen output can only be used by Xen PV guests.
			The xen output can only be used by Xen PV guests.


	edac_report=	[HW,EDAC] Control how to report EDAC event
			Format: {"on" | "off" | "force"}
			on: enable EDAC to report H/W events. May be overridden
			by another higher-priority error reporting module.
			off: disable H/W event reporting through EDAC.
			force: enforce the use of EDAC to report H/W event.
			default: on.

	ekgdboc=	[X86,KGDB] Allow early kernel console debugging
	ekgdboc=	[X86,KGDB] Allow early kernel console debugging
			ekgdboc=kbd
			ekgdboc=kbd


+10 −4
Original line number Original line Diff line number Diff line
@@ -33,22 +33,28 @@
#include <linux/acpi.h>
#include <linux/acpi.h>
#include <linux/cper.h>
#include <linux/cper.h>
#include <acpi/apei.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
#include <asm/mce.h>
#include <asm/mce.h>


#include "mce-internal.h"
#include "mce-internal.h"


void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
{
	struct mce m;
	struct mce m;


	/* Only corrected MC is reported */
	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
		return;
		return;


	mce_setup(&m);
	mce_setup(&m);
	m.bank = 1;
	m.bank = 1;
	/* Fake a memory read corrected error with unknown channel */
	/* Fake a memory read error with unknown channel */
	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;

	if (severity >= GHES_SEV_RECOVERABLE)
		m.status |= MCI_STATUS_UC;
	if (severity >= GHES_SEV_PANIC)
		m.status |= MCI_STATUS_PCC;

	m.addr = mem_err->physical_addr;
	m.addr = mem_err->physical_addr;
	mce_log(&m);
	mce_log(&m);
	mce_notify_irq();
	mce_notify_irq();
+7 −5
Original line number Original line Diff line number Diff line
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)


static void mce_start_timer(unsigned int cpu, struct timer_list *t)
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
{
	unsigned long iv = mce_adjust_timer(check_interval * HZ);
	unsigned long iv = check_interval * HZ;

	__this_cpu_write(mce_next_interval, iv);


	if (mca_cfg.ignore_ce || !iv)
	if (mca_cfg.ignore_ce || !iv)
		return;
		return;


	per_cpu(mce_next_interval, cpu) = iv;

	t->expires = round_jiffies(jiffies + iv);
	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, smp_processor_id());
	add_timer_on(t, cpu);
}
}


static void __mcheck_cpu_init_timer(void)
static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
	dev->release = &mce_device_release;
	dev->release = &mce_device_release;


	err = device_register(dev);
	err = device_register(dev);
	if (err)
	if (err) {
		put_device(dev);
		return err;
		return err;
	}


	for (i = 0; mce_device_attrs[i]; i++) {
	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		err = device_create_file(dev, mce_device_attrs[i]);
+16 −2
Original line number Original line Diff line number Diff line
@@ -12,6 +12,7 @@
#include <acpi/acpi_bus.h>
#include <acpi/acpi_bus.h>
#include <linux/cper.h>
#include <linux/cper.h>
#include <linux/ratelimit.h>
#include <linux/ratelimit.h>
#include <linux/edac.h>
#include <asm/cpu.h>
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/mce.h>


@@ -43,6 +44,8 @@ struct extlog_l1_head {
	u8  rev1[12];
	u8  rev1[12];
};
};


static int old_edac_report_status;

static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";
static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";


/* L1 table related physical address */
/* L1 table related physical address */
@@ -150,7 +153,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,


	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);


	return NOTIFY_DONE;
	return NOTIFY_STOP;
}
}


static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
@@ -231,8 +234,12 @@ static int __init extlog_init(void)
	u64 cap;
	u64 cap;
	int rc;
	int rc;


	rc = -ENODEV;
	if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
		pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
		return -EPERM;
	}


	rc = -ENODEV;
	rdmsrl(MSR_IA32_MCG_CAP, cap);
	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (!(cap & MCG_ELOG_P))
	if (!(cap & MCG_ELOG_P))
		return rc;
		return rc;
@@ -287,6 +294,12 @@ static int __init extlog_init(void)
	if (elog_buf == NULL)
	if (elog_buf == NULL)
		goto err_release_elog;
		goto err_release_elog;


	/*
	 * eMCA event report method has higher priority than EDAC method,
	 * unless EDAC event report method is mandatory.
	 */
	old_edac_report_status = get_edac_report_status();
	set_edac_report_status(EDAC_REPORTING_DISABLED);
	mce_register_decode_chain(&extlog_mce_dec);
	mce_register_decode_chain(&extlog_mce_dec);
	/* enable OS to be involved to take over management from BIOS */
	/* enable OS to be involved to take over management from BIOS */
	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -308,6 +321,7 @@ static int __init extlog_init(void)


static void __exit extlog_exit(void)
static void __exit extlog_exit(void)
{
{
	set_edac_report_status(old_edac_report_status);
	mce_unregister_decode_chain(&extlog_mce_dec);
	mce_unregister_decode_chain(&extlog_mce_dec);
	((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
	((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
	if (extlog_l1_addr)
	if (extlog_l1_addr)
Loading