Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ad6c2c2e authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab:
 "For:

   - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac);
   - error injection support for i5100, when EDAC debug is enabled;
   - fix edac when it is loaded builtin (early init for the subsystem);
   - a "Firmware First" EDAC driver, allowing ghes to report errors via
     EDAC (ghes-edac).

  With regards to ghes-edac, this fixes a longstanding BZ at Red Hat
  that happens with Nehalem and Sandy Bridge CPUs: when both GHES and
  i7core_edac or sb_edac are running, the error reports are
  unpredictable, as both BIOS and OS race to access the registers.  With
  ghes-edac, the EDAC core will refuse to register any other concurrent
  memory error driver.

  This patchset moves the ghes struct definitions to a separate header
  file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to
  register/unregister and to report errors via ghes-edac.  Those changes
  were acked by ghes driver maintainer (Huang)."

* 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits)
  i5100_edac: convert to use simple_open()
  ghes_edac: fix to use list_for_each_entry_safe() when delete list items
  ghes_edac: Fix RAS tracing
  ghes_edac: Make it compliant with UEFI spec 2.3.1
  ghes_edac: Improve driver's printk messages
  ghes_edac: Don't credit the same memory dimm twice
  ghes_edac: do a better job of filling EDAC DIMM info
  ghes_edac: add support for reporting errors via EDAC
  ghes_edac: Register at EDAC core the BIOS report
  ghes: add the needed hooks for EDAC error report
  ghes: move structures/enum to a header file
  edac: add support for error type "Info"
  edac: add support for raw error reports
  edac: reduce stack pressure by using a pre-allocated buffer
  edac: lock module owner to avoid error report conflicts
  edac: remove proc_name from mci structure
  edac: add a new memory layer type
  edac: initialize the core earlier
  edac: better report error conditions in debug mode
  i5100_edac: Remove two checkpatch warnings
  ...
parents 19cc90f5 b0769891
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -2904,6 +2904,13 @@ W: bluesmoke.sourceforge.net
S:	Maintained
F:	drivers/edac/e7xxx_edac.c

EDAC-GHES
M:	Mauro Carvalho Chehab <mchehab@redhat.com>
L:	linux-edac@vger.kernel.org
W:	bluesmoke.sourceforge.net
S:	Maintained
F:	drivers/edac/ghes-edac.c

EDAC-I82443BXGX
M:	Tim Small <tim@buttersideup.com>
L:	linux-edac@vger.kernel.org
+21 −50
Original line number Diff line number Diff line
@@ -48,8 +48,8 @@
#include <linux/genalloc.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <acpi/apei.h>
#include <acpi/hed.h>

#include <acpi/ghes.h>
#include <asm/mce.h>
#include <asm/tlbflush.h>
#include <asm/nmi.h>
@@ -84,42 +84,6 @@
	((struct acpi_hest_generic_status *)				\
	 ((struct ghes_estatus_node *)(estatus_node) + 1))

/*
 * One struct ghes is created for each generic hardware error source.
 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
 * handler.
 *
 * estatus: memory buffer for error status block, allocated during
 * HEST parsing.
 */
#define GHES_TO_CLEAR		0x0001
#define GHES_EXITING		0x0002

struct ghes {
	struct acpi_hest_generic *generic;
	struct acpi_hest_generic_status *estatus;
	u64 buffer_paddr;
	unsigned long flags;
	union {
		struct list_head list;
		struct timer_list timer;
		unsigned int irq;
	};
};

struct ghes_estatus_node {
	struct llist_node llnode;
	struct acpi_hest_generic *generic;
};

struct ghes_estatus_cache {
	u32 estatus_len;
	atomic_t count;
	struct acpi_hest_generic *generic;
	unsigned long long time_in;
	struct rcu_head rcu;
};

bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);

@@ -333,13 +297,6 @@ static void ghes_fini(struct ghes *ghes)
	apei_unmap_generic_address(&ghes->generic->error_status_address);
}

enum {
	GHES_SEV_NO = 0x0,
	GHES_SEV_CORRECTED = 0x1,
	GHES_SEV_RECOVERABLE = 0x2,
	GHES_SEV_PANIC = 0x3,
};

static inline int ghes_severity(int severity)
{
	switch (severity) {
@@ -452,7 +409,8 @@ static void ghes_clear_estatus(struct ghes *ghes)
	ghes->flags &= ~GHES_TO_CLEAR;
}

static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
static void ghes_do_proc(struct ghes *ghes,
			 const struct acpi_hest_generic_status *estatus)
{
	int sev, sec_sev;
	struct acpi_hest_generic_data *gdata;
@@ -464,6 +422,8 @@ static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
				 CPER_SEC_PLATFORM_MEM)) {
			struct cper_sec_mem_err *mem_err;
			mem_err = (struct cper_sec_mem_err *)(gdata+1);
			ghes_edac_report_mem_error(ghes, sev, mem_err);

#ifdef CONFIG_X86_MCE
			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
						  mem_err);
@@ -682,7 +642,7 @@ static int ghes_proc(struct ghes *ghes)
		if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
	}
	ghes_do_proc(ghes->estatus);
	ghes_do_proc(ghes, ghes->estatus);
out:
	ghes_clear_estatus(ghes);
	return 0;
@@ -775,7 +735,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
		len = apei_estatus_len(estatus);
		node_len = GHES_ESTATUS_NODE_LEN(len);
		ghes_do_proc(estatus);
		ghes_do_proc(estatus_node->ghes, estatus);
		if (!ghes_estatus_cached(estatus)) {
			generic = estatus_node->generic;
			if (ghes_print_estatus(NULL, generic, estatus))
@@ -864,6 +824,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
						      node_len);
		if (estatus_node) {
			estatus_node->ghes = ghes;
			estatus_node->generic = ghes->generic;
			estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
			memcpy(estatus, ghes->estatus, len);
@@ -942,6 +903,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
		ghes = NULL;
		goto err;
	}

	rc = ghes_edac_register(ghes, &ghes_dev->dev);
	if (rc < 0)
		goto err;

	switch (generic->notify.type) {
	case ACPI_HEST_NOTIFY_POLLED:
		ghes->timer.function = ghes_poll_func;
@@ -954,13 +920,13 @@ static int ghes_probe(struct platform_device *ghes_dev)
		if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
			pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
			       generic->header.source_id);
			goto err;
			goto err_edac_unreg;
		}
		if (request_irq(ghes->irq, ghes_irq_func,
				0, "GHES IRQ", ghes)) {
			pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
			       generic->header.source_id);
			goto err;
			goto err_edac_unreg;
		}
		break;
	case ACPI_HEST_NOTIFY_SCI:
@@ -986,6 +952,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
	platform_set_drvdata(ghes_dev, ghes);

	return 0;
err_edac_unreg:
	ghes_edac_unregister(ghes);
err:
	if (ghes) {
		ghes_fini(ghes);
@@ -1038,6 +1006,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
	}

	ghes_fini(ghes);

	ghes_edac_unregister(ghes);

	kfree(ghes);

	platform_set_drvdata(ghes_dev, NULL);
+23 −0
Original line number Diff line number Diff line
@@ -80,6 +80,29 @@ config EDAC_MM_EDAC
	  occurred so that a particular failing memory module can be
	  replaced.  If unsure, select 'Y'.

config EDAC_GHES
	bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
	depends on ACPI_APEI_GHES && (EDAC_MM_EDAC=y)
	default y
	help
	  Not all machines support hardware-driven error report. Some of those
	  provide a BIOS-driven error report mechanism via ACPI, using the
	  APEI/GHES driver. By enabling this option, the error reports provided
	  by GHES are sent to userspace via the EDAC API.

	  When this option is enabled, it will disable the hardware-driven
	  mechanisms, if a GHES BIOS is detected, entering into the
	  "Firmware First" mode.

	  It should be noticed that keeping both GHES and a hardware-driven
	  error mechanism won't work well, as BIOS will race with OS, while
	  reading the error registers. So, if you want to not use "Firmware
	  first" GHES error mechanism, you should disable GHES either at
	  compilation time or by passing "ghes.disable=1" Kernel parameter
	  at boot time.

	  In doubt, say 'Y'.

config EDAC_AMD64
	tristate "AMD64 (Opteron, Athlon64) K8, F10h"
	depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
+1 −0
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ ifdef CONFIG_PCI
edac_core-y	+= edac_pci.o edac_pci_sysfs.o
endif

obj-$(CONFIG_EDAC_GHES)			+= ghes_edac.o
obj-$(CONFIG_EDAC_MCE_INJ)		+= mce_amd_inj.o

edac_mce_amd-y				:= mce_amd.o
+5 −0
Original line number Diff line number Diff line
@@ -453,6 +453,11 @@ extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
				      unsigned long page);

void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
			      struct mem_ctl_info *mci,
			      struct edac_raw_error_desc *e);

void edac_mc_handle_error(const enum hw_event_mc_err_type type,
			  struct mem_ctl_info *mci,
			  const u16 error_count,
Loading