Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b63a0ffe authored by Mahesh Salgaonkar's avatar Mahesh Salgaonkar Committed by Benjamin Herrenschmidt
Browse files

powerpc/powernv: Machine check exception handling.



Add basic error handling in machine check exception handler.

- If MSR_RI isn't set, we can not recover.
- Check if disposition set to OpalMCE_DISPOSITION_RECOVERED.
- Check if address at fault is inside kernel address space, if not then send
  SIGBUS to process if we hit exception when in userspace.
- If the address at fault is not provided and we get a synchronous machine
  check while in userspace, then kill the task.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent 28446de2
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -193,5 +193,6 @@ extern void release_mce_event(void);
extern void machine_check_queue_event(void);
extern void machine_check_process_queued_event(void);
extern void machine_check_print_event_info(struct machine_check_event *evt);
extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);

#endif /* __ASM_PPC64_MCE_H__ */
+27 −0
Original line number Diff line number Diff line
@@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt)
		break;
	}
}

/*
 * get_mce_fault_addr - fetch the effective address recorded in an MCE event.
 * @evt: machine check event to inspect.
 *
 * For UE, SLB, ERAT and TLB error types, the matching member of evt->u is
 * consulted and its effective_address is returned when the corresponding
 * effective_address_provided flag is set.
 *
 * Returns 0 when the flag is clear or when the error type is any other
 * value (including MCE_ERROR_TYPE_UNKNOWN).
 */
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
	uint64_t fault_addr = 0;

	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		if (evt->u.ue_error.effective_address_provided)
			fault_addr = evt->u.ue_error.effective_address;
		break;
	case MCE_ERROR_TYPE_SLB:
		if (evt->u.slb_error.effective_address_provided)
			fault_addr = evt->u.slb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_ERAT:
		if (evt->u.erat_error.effective_address_provided)
			fault_addr = evt->u.erat_error.effective_address;
		break;
	case MCE_ERROR_TYPE_TLB:
		if (evt->u.tlb_error.effective_address_provided)
			fault_addr = evt->u.tlb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_UNKNOWN:
	default:
		/* No address is reported for these error types. */
		break;
	}

	return fault_addr;
}
EXPORT_SYMBOL(get_mce_fault_addr);
+42 −1
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/kobject.h>
#include <asm/opal.h>
#include <asm/firmware.h>
@@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
	return written;
}

/*
 * opal_recover_mce - decide whether a machine check can be recovered from.
 * @regs: register state at the time of the exception.
 * @evt:  decoded machine check event.
 *
 * Returns 1 when the event is considered handled, 0 otherwise.
 *
 * The checks are applied in this order:
 *  1. MSR_RI clear in regs->msr                    -> 0
 *  2. disposition is MCE_DISPOSITION_RECOVERED     -> 1
 *  3. not in user mode, or current is global init  -> 0
 *  4. a non-zero fault address that is_kernel_addr()
 *     rejects                                      -> SIGBUS, 1
 *  5. severity is MCE_SEV_ERROR_SYNC               -> SIGBUS, 1
 *  6. anything else                                -> 0
 */
static int opal_recover_mce(struct pt_regs *regs,
					struct machine_check_event *evt)
{
	uint64_t fault_addr = get_mce_fault_addr(evt);

	/* If MSR_RI isn't set, we cannot recover */
	if (!(regs->msr & MSR_RI))
		return 0;

	/* Platform corrected itself */
	if (evt->disposition == MCE_DISPOSITION_RECOVERED)
		return 1;

	/*
	 * Every remaining recovery path delivers SIGBUS to the interrupted
	 * task, which is only done for an exception taken in user mode and
	 * never for the global init task.
	 */
	if (!user_mode(regs) || is_global_init(current))
		return 0;

	if (fault_addr && !is_kernel_addr(fault_addr)) {
		/*
		 * A faulting address was reported and it is not a kernel
		 * address. Ideally we would find which process uses this
		 * address; for now, signal the task that was running in
		 * userspace when the exception was received.
		 *
		 * TODO: Queue up this address for hwpoisoning later.
		 */
		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
		return 1;
	}

	if (evt->severity == MCE_SEV_ERROR_SYNC) {
		/*
		 * Synchronous error received while in userspace (with no
		 * usable non-kernel fault address): signal the task.
		 */
		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
		return 1;
	}

	return 0;
}

int opal_machine_check(struct pt_regs *regs)
{
	struct machine_check_event evt;
@@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs)
	}
	machine_check_print_event_info(&evt);

	return evt.severity == MCE_SEV_FATAL ? 0 : 1;
	if (opal_recover_mce(regs, &evt))
		return 1;
	return 0;
}

static irqreturn_t opal_interrupt(int irq, void *data)