Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b05e1310 authored by Kyle Yan, committed by Runmin Wang
Browse files

drivers: edac: Add Cache Error Reporting driver for Kryo processors



The Cache Error Reporting driver receives error interrupts for single-bit
and double-bit errors, checks the corresponding syndrome registers, and
takes action based on configuration options. The syndrome registers can
optionally be polled, and single-bit and double-bit errors can each be
configured to trigger a kernel panic.

Change-Id: I025037da5c5ac6f5520b683af69c462663c1e4f0
Signed-off-by: Kyle Yan <kyan@codeaurora.org>
Signed-off-by: Runmin Wang <runminw@codeaurora.org>
parent 577bba76
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
/* Copyright (c) 2017, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#ifndef ASM_KRYO_EDAC_H
#define ASM_KRYO_EDAC_H

/*
 * Poll the Kryo L1/L2/L3/SCU cache error syndrome registers and report
 * any logged errors through EDAC.  Callers with no EDAC context (the
 * arm64 fault path) pass NULL to use the device registered by the
 * kryo_arm64_edac driver at probe time.
 *
 * The driver defines this function as taking a
 * struct edac_device_ctl_info *; the original header declared it with a
 * void * parameter, so every call went through an incompatible
 * prototype, which is undefined behavior in C.  A forward declaration
 * keeps this header free of EDAC header dependencies while matching the
 * definition.
 */
struct edac_device_ctl_info;

#if defined(CONFIG_EDAC_KRYO_ARM64)
void kryo_poll_cache_errors(struct edac_device_ctl_info *edev_ctl);
#else
static inline void kryo_poll_cache_errors(struct edac_device_ctl_info *edev_ctl) { }
#endif

#endif
+2 −0
Original line number Diff line number Diff line
@@ -41,6 +41,7 @@
#include <asm/system_misc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/kryo-arm64-edac.h>

struct fault_info {
	int	(*fn)(unsigned long addr, unsigned int esr,
@@ -519,6 +520,7 @@ static int do_alignment_fault(unsigned long addr, unsigned int esr,
 */
/*
 * Catch-all handler for faults with no dedicated handler.  An otherwise
 * unexplained abort may be the side effect of an ECC error, so poll the
 * Kryo cache error syndrome registers (NULL selects the EDAC device
 * registered at probe time) before reporting the fault as unhandled.
 */
static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
	kryo_poll_cache_errors(NULL);
	return 1;
}

+42 −0
Original line number Diff line number Diff line
@@ -450,6 +450,48 @@ config EDAC_SYNOPSYS
	  Support for error detection and correction on the Synopsys DDR
	  memory controller.

config EDAC_KRYO_ARM64
	depends on ARM64
	tristate "ARM KRYO Gold and Silver L1/L2/L3/SCU Caches"
	help
	   Support for error detection and correction on the
	   Kryo3xx Gold and Silver CPUs. Reports errors caught by Kryo3xx
	   ECC mechanism.
	   For debugging issues having to do with stability and overall system
	   health, you should probably say 'Y' here.

config EDAC_KRYO_ARM64_POLL
	depends on EDAC_KRYO_ARM64
	bool "Poll on kryo ECC registers - kryo"
	help
	   This option chooses whether or not you want to poll on the Kryo3xx
	   ECC registers. When this is enabled, the polling rate can be set as
	   a module parameter. By default, it will call the polling function
	   every second.
	   This option should only be used if the associated interrupt lines
	   are not enabled.

config EDAC_KRYO_ARM64_PANIC_ON_CE
	depends on EDAC_KRYO_ARM64
	bool "Panic on correctable errors - kryo"
	help
	   Forcibly cause a kernel panic if a correctable error (CE) is
	   detected, even though the error is (by definition) correctable and
	   would otherwise result in no adverse system effects. This can reduce
	   debugging times on hardware which may be operating at voltages or
	   frequencies outside normal specification.
	   For production builds, you should definitely say 'N' here.

config EDAC_KRYO_ARM64_PANIC_ON_UE
	depends on EDAC_KRYO_ARM64
	bool "Panic on uncorrectable errors - kryo"
	help
	   Forcibly cause a kernel panic if an uncorrectable error (UE) is
	   detected. This can reduce debugging times on hardware which may be
	   operating at voltages or frequencies outside normal specification.
	   For production builds, you should probably say 'N' here.


config EDAC_XGENE
	tristate "APM X-Gene SoC"
	depends on (ARM64 || COMPILE_TEST)
+1 −0
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@ obj-$(CONFIG_EDAC_IE31200) += ie31200_edac.o
obj-$(CONFIG_EDAC_X38)			+= x38_edac.o
obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
obj-$(CONFIG_EDAC_KRYO_ARM64)		+= kryo_arm64_edac.o

amd64_edac_mod-y := amd64_edac.o
amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o
+463 −0
Original line number Diff line number Diff line
/* Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/edac.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/of_irq.h>

#include <asm/cputype.h>

#include "edac_mc.h"
#include "edac_device.h"

#ifdef CONFIG_EDAC_KRYO_ARM64_POLL
/* Polling interval for the syndrome registers, in milliseconds.
 * Read-only module parameter; defaults to one second. */
static int poll_msec = 1000;
module_param(poll_msec, int, 0444);
#endif

/* Compile-time policy: panic when a correctable error (CE) is seen. */
#ifdef CONFIG_EDAC_KRYO_ARM64_PANIC_ON_CE
#define ARM64_ERP_PANIC_ON_CE 1
#else
#define ARM64_ERP_PANIC_ON_CE 0
#endif

/* Compile-time policy: panic when an uncorrectable error (UE) is seen. */
#ifdef CONFIG_EDAC_KRYO_ARM64_PANIC_ON_UE
#define ARM64_ERP_PANIC_ON_UE 1
#else
#define ARM64_ERP_PANIC_ON_UE 0
#endif

/* Cache level indices as encoded in the ERRXMISC LVL field. */
#define L1 0x0
#define L2 0x1
#define L3 0x2

#define EDAC_CPU	"kryo_edac"

/*
 * Field extractors for the raw syndrome register values.  Macro
 * arguments are fully parenthesized so that compound expressions
 * (e.g. KRYO_ERRXSTATUS_VALID(a | b)) expand correctly; the original
 * macros expanded the bare argument, a classic precedence hazard.
 */
#define KRYO_ERRXSTATUS_VALID(a)	(((a) >> 30) & 0x1)
#define KRYO_ERRXSTATUS_UE(a)	(((a) >> 29) & 0x1)
#define KRYO_ERRXSTATUS_SERR(a)	((a) & 0xFF)

#define KRYO_ERRXMISC_LVL(a)		(((a) >> 1) & 0x7)
#define KRYO_ERRXMISC_WAY(a)		(((a) >> 28) & 0xF)

/*
 * Raw accessors for the ARMv8 RAS-extension error-record registers
 * (ERRSELR_EL1, ERXCTLR_EL1, ERXSTATUS_EL1, ERXMISC0_EL1), written with
 * s3_0_cX_cY_Z encodings because the toolchain headers did not yet name
 * these registers.
 */

/* Enable error reporting in ERXCTLR_EL1 for the selected record.
 * 0x10f is implementation-specific -- TODO confirm the bit meanings
 * against the Kryo3xx TRM. */
static inline void set_errxctlr_el1(void)
{
	u64 val = 0x10f;

	asm volatile("msr s3_0_c5_c4_1, %0" : : "r" (val));
}

/* Preset ERXMISC0_EL1 to a known state before enabling reporting.
 * 0x7F7F00000000 is implementation-specific -- TODO confirm. */
static inline void set_errxmisc_overflow(void)
{
	u64 val = 0x7F7F00000000;

	asm volatile("msr s3_0_c5_c5_0, %0" : : "r" (val));
}

/* Select which error record the ERX* registers refer to
 * (callers use 0 for this CPU's L1/L2 record, 1 for the L3/SCU record). */
static inline void write_errselr_el1(u64 val)
{
	asm volatile("msr s3_0_c5_c3_1, %0" : : "r" (val));
}

/* Read the selected record's ERXSTATUS_EL1 (VALID/UE/SERR fields). */
static inline u64 read_errxstatus_el1(void)
{
	u64 val;

	asm volatile("mrs %0, s3_0_c5_c4_2" : "=r" (val));
	return val;
}

/* Read the selected record's ERXMISC0_EL1 (cache level and way fields). */
static inline u64 read_errxmisc_el1(void)
{
	u64 val;

	asm volatile("mrs %0, s3_0_c5_c5_0" : "=r" (val));
	return val;
}

/* Write the previously read value back to ERXSTATUS_EL1 to acknowledge
 * the error -- presumably write-one-to-clear semantics; TODO confirm. */
static inline void clear_errxstatus_valid(u64 val)
{
	asm volatile("msr s3_0_c5_c4_2, %0" : : "r" (val));
}

/*
 * Maps one decoded error class to its EDAC log message and the EDAC
 * handler (correctable vs uncorrectable) used to report it.
 */
struct errors_edac {
	const char * const msg;
	void (*func)(struct edac_device_ctl_info *edac_dev,
			int inst_nr, int block_nr, const char *msg);
};

/* Indexed by the KRYO_L*_CE/KRYO_L*_UE constants below. */
static const struct errors_edac errors[] = {
	{"Kryo3xx L1 Correctable Error", edac_device_handle_ce },
	{"Kryo3xx L1 Uncorrectable Error", edac_device_handle_ue },
	{"Kryo3xx L2 Correctable Error", edac_device_handle_ce },
	{"Kryo3xx L2 Uncorrectable Error", edac_device_handle_ue },
	{"L3 Correctable Error", edac_device_handle_ce },
	{"L3 Uncorrectable Error", edac_device_handle_ue },
};

/* Indices into errors[] above. */
#define KRYO_L1_CE 0
#define KRYO_L1_UE 1
#define KRYO_L2_CE 2
#define KRYO_L2_UE 3
#define KRYO_L3_CE 4
#define KRYO_L3_UE 5

/* ERRXSTATUS SERR error-class codes -- presumably from the Kryo3xx
 * TRM; TODO confirm. */
#define DATA_BUF_ERR		0x2
#define CACHE_DATA_ERR		0x6
#define CACHE_TAG_DIRTY_ERR	0x7
#define TLB_PARITY_ERR		0x8
#define BUS_ERROR		0x18

/* Per-device driver state. */
struct erp_drvdata {
	struct edac_device_ctl_info *edev_ctl;	/* EDAC device handle */
	struct erp_drvdata __percpu **erp_cpu_drvdata;	/* per-cpu IRQ dev_id */
	int ppi;	/* per-cpu IRQ number; valid when erp_cpu_drvdata is set */
};

/* Fallback drvdata for callers with no context (fault path, per-cpu
 * IRQ handler); assigned once during probe. */
static struct erp_drvdata *panic_handler_drvdata;

/* Serializes the ERRSELR_EL1 select + ERX* read/clear sequences. */
static DEFINE_SPINLOCK(local_handler_lock);

/* SMP callback: enable the per-cpu ECC fault IRQ (passed via *info)
 * on whichever CPU this runs on. */
static void l1_l2_irq_enable(void *info)
{
	enable_percpu_irq(*(int *)info, IRQ_TYPE_LEVEL_HIGH);
}

/*
 * Look up the IRQ resource named @propname on @pdev and request it:
 * as a normal threaded IRQ when @percpu is 0, or as a per-cpu IRQ
 * (enabled on every CPU) when @percpu is non-zero.
 *
 * A missing property or failed request is logged but non-fatal
 * ("Proceeding anyway"); -EINVAL is returned so the caller can count
 * how many IRQs could not be set up.
 */
static int request_erp_irq(struct platform_device *pdev, const char *propname,
			const char *desc, irq_handler_t handler,
			void *ed, int percpu)
{
	int rc;
	int cpu;
	struct resource *r;
	struct erp_drvdata *drv = ed;

	r = platform_get_resource_byname(pdev, IORESOURCE_IRQ, propname);

	if (!r) {
		pr_err("ARM64 CPU ERP: Could not find <%s> IRQ property. Proceeding anyway.\n",
			propname);
		goto out;
	}

	if (!percpu) {
		rc = devm_request_threaded_irq(&pdev->dev, r->start, NULL,
					       handler,
					       IRQF_ONESHOT | IRQF_TRIGGER_HIGH,
					       desc,
					       ed);

		if (rc) {
			pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n",
			       (int) r->start, rc, propname, desc);
			goto out;
		}

	} else {
		drv->erp_cpu_drvdata = alloc_percpu(struct erp_drvdata *);
		if (!drv->erp_cpu_drvdata) {
			pr_err("Failed to allocate percpu erp data\n");
			goto out;
		}

		/*
		 * Initialize the per-cpu dev_id slot on every possible CPU,
		 * not just the CPU running probe: the per-cpu IRQ core hands
		 * each handler its own CPU's slot, and the original
		 * raw_cpu_ptr() left every other CPU's slot NULL.
		 */
		for_each_possible_cpu(cpu)
			*per_cpu_ptr(drv->erp_cpu_drvdata, cpu) = drv;

		rc = request_percpu_irq(r->start, handler, desc,
				drv->erp_cpu_drvdata);

		if (rc) {
			pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n",
			       (int) r->start, rc, propname, desc);
			goto out_free;
		}

		drv->ppi = r->start;
		on_each_cpu(l1_l2_irq_enable, &(r->start), 1);
	}

	return 0;

out_free:
	free_percpu(drv->erp_cpu_drvdata);
	drv->erp_cpu_drvdata = NULL;
out:
	return -EINVAL;
}

/*
 * Print the syndrome registers for one decoded error and hand it to the
 * matching EDAC handler from errors[].
 *
 * @errorcode:  index into errors[] (one of KRYO_L*_CE / KRYO_L*_UE)
 * @level:      cache level constant L1/L2/L3 (0-based; printed as L<n+1>)
 * @errxstatus: raw ERXSTATUS value already read by the caller
 * @errxmisc:   raw ERXMISC value already read by the caller
 * @edev_ctl:   EDAC device to report against
 */
static void dump_err_reg(int errorcode, int level, u64 errxstatus, u64 errxmisc,
	struct edac_device_ctl_info *edev_ctl)
{
	edac_printk(KERN_CRIT, EDAC_CPU, "ERRXSTATUS_EL1: %llx\n", errxstatus);
	edac_printk(KERN_CRIT, EDAC_CPU, "ERRXMISC_EL1: %llx\n", errxmisc);
	edac_printk(KERN_CRIT, EDAC_CPU, "Cache level: L%d\n", level + 1);

	switch (KRYO_ERRXSTATUS_SERR(errxstatus)) {
	case DATA_BUF_ERR:
		edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from internal data buffer\n");
		break;

	case CACHE_DATA_ERR:
		edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from cache data RAM\n");
		break;

	case CACHE_TAG_DIRTY_ERR:
		edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from cache tag or dirty RAM\n");
		break;

	case TLB_PARITY_ERR:
		edac_printk(KERN_CRIT, EDAC_CPU, "Parity error on TLB RAM\n");
		break;

	case BUS_ERROR:
		edac_printk(KERN_CRIT, EDAC_CPU, "Bus Error\n");
		break;

	default:
		/* Don't silently drop error classes we don't recognize. */
		edac_printk(KERN_CRIT, EDAC_CPU, "Unknown SERR: 0x%llx\n",
			KRYO_ERRXSTATUS_SERR(errxstatus));
		break;
	}

	/* For L1/L2 only the top two bits of the WAY field encode the
	 * way -- presumably; TODO confirm against the Kryo3xx TRM. */
	if (level == L3)
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Way: %d\n", (int) KRYO_ERRXMISC_WAY(errxmisc));
	else
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Way: %d\n", (int) KRYO_ERRXMISC_WAY(errxmisc) >> 2);
	errors[errorcode].func(edev_ctl, smp_processor_id(),
				level, errors[errorcode].msg);
}

/*
 * Decode the cache level from ERRXMISC and report a valid L1 or L2
 * error as correctable or uncorrectable, based on the UE bit in
 * ERRXSTATUS.  Levels other than L1/L2 are ignored here (L3 is handled
 * by the SCU path).
 */
static void kryo_parse_l1_l2_cache_error(u64 errxstatus, u64 errxmisc,
	struct edac_device_ctl_info *edev_ctl)
{
	int uncorrectable = KRYO_ERRXSTATUS_UE(errxstatus);

	switch (KRYO_ERRXMISC_LVL(errxmisc)) {
	case L1:
		dump_err_reg(uncorrectable ? KRYO_L1_UE : KRYO_L1_CE, L1,
			errxstatus, errxmisc, edev_ctl);
		break;
	case L2:
		dump_err_reg(uncorrectable ? KRYO_L2_UE : KRYO_L2_CE, L2,
			errxstatus, errxmisc, edev_ctl);
		break;
	}
}

/*
 * Check the calling CPU's L1/L2 error record and report any valid
 * error.  Must run on the CPU whose record is to be checked (invoked
 * via smp_call_function_single() or from the per-cpu fault IRQ).
 * @info is the struct edac_device_ctl_info to report against.
 */
static void kryo_check_l1_l2_ecc(void *info)
{
	struct edac_device_ctl_info *edev_ctl = info;
	u64 errxstatus = 0;
	u64 errxmisc = 0;
	unsigned long flags;

	/* The lock keeps the select/read/clear sequence atomic with
	 * respect to the L3/SCU path, which reprograms ERRSELR_EL1. */
	spin_lock_irqsave(&local_handler_lock, flags);
	/* Record 0: this CPU's L1/L2 error record. */
	write_errselr_el1(0);
	errxstatus = read_errxstatus_el1();
	if (KRYO_ERRXSTATUS_VALID(errxstatus)) {
		errxmisc = read_errxmisc_el1();
		edac_printk(KERN_CRIT, EDAC_CPU,
		"Kryo3xx CPU%d detected a L1/L2 cache error\n",
		smp_processor_id());

		kryo_parse_l1_l2_cache_error(errxstatus, errxmisc, edev_ctl);
		/* Acknowledge so the record can log new errors. */
		clear_errxstatus_valid(errxstatus);
	}
	spin_unlock_irqrestore(&local_handler_lock, flags);
}

/*
 * Check the shared L3/SCU error record (record 1) and report any valid
 * error whose level decodes as L3.  Unlike the L1/L2 path this record
 * is shared, so it can be checked from any CPU.
 */
static void kryo_check_l3_scu_error(struct edac_device_ctl_info *edev_ctl)
{
	u64 errxstatus = 0;
	u64 errxmisc = 0;
	unsigned long flags;

	/* Serialize ERRSELR_EL1 selection against the L1/L2 path. */
	spin_lock_irqsave(&local_handler_lock, flags);
	/* Record 1: the shared L3/SCU error record. */
	write_errselr_el1(1);
	errxstatus = read_errxstatus_el1();
	errxmisc = read_errxmisc_el1();

	if (KRYO_ERRXSTATUS_VALID(errxstatus) &&
		KRYO_ERRXMISC_LVL(errxmisc) == L3) {
		if (KRYO_ERRXSTATUS_UE(errxstatus)) {
			edac_printk(KERN_CRIT, EDAC_CPU, "Detected L3 uncorrectable error\n");
			dump_err_reg(KRYO_L3_UE, L3, errxstatus, errxmisc,
				edev_ctl);
		} else {
			edac_printk(KERN_CRIT, EDAC_CPU, "Detected L3 correctable error\n");
			dump_err_reg(KRYO_L3_CE, L3, errxstatus, errxmisc,
				edev_ctl);
		}

		/* Acknowledge so the record can log new errors. */
		clear_errxstatus_valid(errxstatus);
	}
	spin_unlock_irqrestore(&local_handler_lock, flags);
}

/*
 * Poll the shared L3/SCU error record on this CPU and the L1/L2 error
 * record on every possible CPU.
 *
 * Called both as the EDAC polling hook (@edev_ctl set) and from the
 * arm64 unknown-fault path via do_bad() (@edev_ctl == NULL).  In the
 * NULL case fall back to the device registered at probe time; if the
 * driver has not probed (or has been removed), bail out instead of
 * dereferencing a NULL panic_handler_drvdata -- the original code
 * would oops inside the fault path in that situation.
 */
void kryo_poll_cache_errors(struct edac_device_ctl_info *edev_ctl)
{
	int cpu;

	if (edev_ctl == NULL) {
		if (panic_handler_drvdata == NULL)
			return;
		edev_ctl = panic_handler_drvdata->edev_ctl;
	}

	kryo_check_l3_scu_error(edev_ctl);
	for_each_possible_cpu(cpu)
		smp_call_function_single(cpu, kryo_check_l1_l2_ecc,
			edev_ctl, 0);
}

/*
 * Per-cpu IRQ handler for L1/L2 ECC faults.  Runs on the faulting CPU,
 * so kryo_check_l1_l2_ecc() reads that CPU's own error record.  Uses
 * the globally registered drvdata -- set in probe before this IRQ is
 * requested -- rather than the per-cpu dev_id in @drvdata.
 */
static irqreturn_t kryo_l1_l2_handler(int irq, void *drvdata)
{
	kryo_check_l1_l2_ecc(panic_handler_drvdata->edev_ctl);
	return IRQ_HANDLED;
}

/*
 * IRQ handler for L3/SCU ECC faults.  @drvdata is the struct
 * erp_drvdata passed at request time; scan the shared L3/SCU error
 * record and report anything found.
 */
static irqreturn_t kryo_l3_scu_handler(int irq, void *drvdata)
{
	struct erp_drvdata *drv = drvdata;

	kryo_check_l3_scu_error(drv->edev_ctl);

	return IRQ_HANDLED;
}

/* Per-cpu setup callback: enable error reporting and preset the
 * syndrome registers on the calling CPU. @info is unused. */
static void initialize_registers(void *info)
{
	set_errxctlr_el1();
	set_errxmisc_overflow();
}

/*
 * Probe: enable ECC reporting on every CPU, register an EDAC device
 * with one instance per possible CPU and three "L" blocks (cache
 * levels), optionally enable polling, and request the per-cpu L1/L2
 * and shared L3/SCU fault IRQs.  Each IRQ is individually optional,
 * but at least one of the IRQs listed in the DT node must be obtained.
 */
static int kryo_cpu_erp_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct erp_drvdata *drv;
	int rc = 0;
	int fail = 0;
	int cpu;

	/* Turn on error reporting in the RAS registers on every CPU. */
	for_each_possible_cpu(cpu)
		smp_call_function_single(cpu, initialize_registers, NULL, 1);


	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);

	if (!drv)
		return -ENOMEM;

	drv->edev_ctl = edac_device_alloc_ctl_info(0, "cpu",
					num_possible_cpus(), "L", 3, 1, NULL, 0,
					edac_device_alloc_index());

	if (!drv->edev_ctl)
		return -ENOMEM;

	#ifdef CONFIG_EDAC_KRYO_ARM64_POLL
	/* Poll the syndrome registers every poll_msec milliseconds. */
	drv->edev_ctl->edac_check = kryo_poll_cache_errors;
	drv->edev_ctl->poll_msec = poll_msec;
	drv->edev_ctl->defer_work = 1;
	#endif

	drv->edev_ctl->dev = dev;
	drv->edev_ctl->mod_name = dev_name(dev);
	drv->edev_ctl->dev_name = dev_name(dev);
	drv->edev_ctl->ctl_name = "cache";
	drv->edev_ctl->panic_on_ce = ARM64_ERP_PANIC_ON_CE;
	drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE;
	platform_set_drvdata(pdev, drv);

	rc = edac_device_add_device(drv->edev_ctl);
	if (rc)
		goto out_mem;

	/* Must be set before the IRQ handlers (which read it) can fire. */
	panic_handler_drvdata = drv;

	if (request_erp_irq(pdev, "l1-l2-faultirq",
			"KRYO L1-L2 ECC FAULTIRQ",
			kryo_l1_l2_handler, drv, 1))
		fail++;

	if (request_erp_irq(pdev, "l3-scu-faultirq",
			"KRYO L3-SCU ECC FAULTIRQ",
			kryo_l3_scu_handler, drv, 0))
		fail++;

	/* Give up only if every IRQ listed in the DT node failed. */
	if (fail == of_irq_count(dev->of_node)) {
		pr_err("KRYO ERP: Could not request any IRQs. Giving up.\n");
		rc = -ENODEV;
		goto out_dev;
	}

	return 0;

out_dev:
	edac_device_del_device(dev);
out_mem:
	edac_device_free_ctl_info(drv->edev_ctl);
	return rc;
}

static int kryo_cpu_erp_remove(struct platform_device *pdev)
{
	struct erp_drvdata *drv = dev_get_drvdata(&pdev->dev);
	struct edac_device_ctl_info *edac_ctl = drv->edev_ctl;


	if (drv->erp_cpu_drvdata != NULL) {
		free_percpu_irq(drv->ppi, drv->erp_cpu_drvdata);
		free_percpu(drv->erp_cpu_drvdata);
	}

	edac_device_del_device(edac_ctl->dev);
	edac_device_free_ctl_info(edac_ctl);

	return 0;
}

/* Device-tree match: nodes with this compatible bind the driver. */
static const struct of_device_id kryo_cpu_erp_match_table[] = {
	{ .compatible = "arm,arm64-kryo-cpu-erp" },
	{ }
};

static struct platform_driver kryo_cpu_erp_driver = {
	.probe = kryo_cpu_erp_probe,
	.remove = kryo_cpu_erp_remove,
	.driver = {
		.name = "kryo_cpu_cache_erp",
		.owner = THIS_MODULE,
		.of_match_table = of_match_ptr(kryo_cpu_erp_match_table),
	},
};

/* Module entry: register the platform driver. */
static int __init kryo_cpu_erp_init(void)
{
	return platform_driver_register(&kryo_cpu_erp_driver);
}
module_init(kryo_cpu_erp_init);

/* Module exit: unregister the platform driver. */
static void __exit kryo_cpu_erp_exit(void)
{
	platform_driver_unregister(&kryo_cpu_erp_driver);
}
module_exit(kryo_cpu_erp_exit);

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Kryo3xx EDAC driver");