Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d334a491 authored by Huang Ying's avatar Huang Ying Committed by Len Brown
Browse files

ACPI, APEI, Generic Hardware Error Source memory error support



Generic Hardware Error Source provides a way to report platform
hardware errors (such as those from the chipset). It works in
so-called "Firmware First" mode, that is, hardware errors are reported
to firmware first, and then reported to Linux by firmware. This way,
non-standard hardware error registers or non-standard hardware links
can be checked by firmware to produce more valuable hardware error
information for Linux.

Currently, only the SCI notification type and memory errors are
supported. More notification types and hardware error types will be
added later. These memory errors are reported to user space through
/dev/mcelog by faking a corrected Machine Check, so that the faulty
memory page can be offlined by /sbin/mcelog if the error count for one
page exceeds the threshold.

On some machines, Machine Check cannot report the physical address for
some corrected memory errors, but GHES can. So this simplified GHES is
implemented first.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
parent 06d65dea
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
static inline void mcheck_intel_therm_init(void) { }
#endif

/*
 * Used by APEI to report memory error via /dev/mcelog
 *
 * corrected: non-zero if the error was corrected; the implementation
 *            returns without reporting anything when this is zero.
 * mem_err:   CPER memory error section describing the error; the
 *            implementation reads its physical_addr field.
 *
 * Only a forward declaration of struct cper_sec_mem_err is given here,
 * which is sufficient because the prototype takes it by pointer.
 */

struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
				      struct cper_sec_mem_err *mem_err);

#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
+2 −0
Original line number Diff line number Diff line
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o

obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o

obj-$(CONFIG_ACPI_APEI)		+= mce-apei.o
+52 −0
Original line number Diff line number Diff line
/*
 * Bridge between MCE and APEI
 *
 * On some machines, corrected memory errors are reported via the APEI
 * generic hardware error source (GHES) instead of a corrected Machine
 * Check. These corrected memory errors can be reported to user space
 * through /dev/mcelog by faking a corrected Machine Check, so that
 * the faulty memory page can be offlined by /sbin/mcelog if the error
 * count for one page exceeds the threshold.
 *
 * Copyright 2010 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/cper.h>
#include <acpi/apei.h>
#include <asm/mce.h>

#include "mce-internal.h"

void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
{
	struct mce m;

	/* Only corrected MC is reported */
	if (!corrected)
		return;

	mce_setup(&m);
	m.bank = 1;
	/* Fake a memory read corrected error with unknown channel */
	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
	m.addr = mem_err->physical_addr;
	mce_log(&m);
	mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+14 −0
Original line number Diff line number Diff line
@@ -7,6 +7,20 @@ config ACPI_APEI
	  especially. In addition it supports error serialization and
	  error injection.

config ACPI_APEI_GHES
	tristate "APEI Generic Hardware Error Source"
	depends on ACPI_APEI && X86
	select ACPI_HED
	help
	  Generic Hardware Error Source provides a way to report
	  platform hardware errors (such as those from the chipset).
	  It works in so-called "Firmware First" mode, that is,
	  hardware errors are reported to firmware first, and then
	  reported to Linux by firmware. This way, non-standard
	  hardware error registers or non-standard hardware links can
	  be checked by firmware to produce more valuable hardware
	  error information for Linux.

config ACPI_APEI_EINJ
	tristate "APEI Error INJection (EINJ)"
	depends on ACPI_APEI && DEBUG_FS
+1 −0
Original line number Diff line number Diff line
obj-$(CONFIG_ACPI_APEI)		+= apei.o
obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o

apei-y := apei-base.o hest.o cper.o
Loading