Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 55672ecf authored by Mahesh Salgaonkar's avatar Mahesh Salgaonkar Committed by Benjamin Herrenschmidt
Browse files

powerpc/book3s: Recover from MC in sapphire on SCOM read via MMIO.



Detect and recover from machine check when inside opal on a special
scom load instructions. On specific SCOM read via MMIO we may get a machine
check exception with SRR0 pointing inside opal. To recover from MC
in this scenario, get a recovery instruction address and return to it from
MC.

OPAL will export the machine check recoverable ranges through
device tree node mcheck-recoverable-ranges under ibm,opal:

# hexdump /proc/device-tree/ibm,opal/mcheck-recoverable-ranges
0000000 0000 0000 3000 2804 0000 000c 0000 0000
0000010 3000 2814 0000 0000 3000 27f0 0000 000c
0000020 0000 0000 3000 2814 xxxx xxxx xxxx xxxx
0000030 llll llll yyyy yyyy yyyy yyyy
...
...
#

where:
	xxxx xxxx xxxx xxxx = Starting instruction address
	llll llll           = Length of the address range.
	yyyy yyyy yyyy yyyy = recovery address

Each recoverable address range entry is (start address, len,
recovery address), 2 cells each for start and recovery address, 1 cell for
len, totalling 5 cells per entry. During kernel boot time, build up the
recovery table with the list of recovery ranges from device-tree node which
will be used during machine check exception to recover from MMIO SCOM UE.

Signed-off-by: default avatarMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: default avatarBenjamin Herrenschmidt <benh@kernel.crashing.org>
parent d2a36071
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -170,6 +170,9 @@ struct machdep_calls {
	int		(*system_reset_exception)(struct pt_regs *regs);
	int 		(*machine_check_exception)(struct pt_regs *regs);

	/* Called during machine check exception to retrive fixup address. */
	bool		(*mce_check_early_recovery)(struct pt_regs *regs);

	/* Motherboard/chipset features. This is a kind of general purpose
	 * hook used to control some machine specific features (like reset
	 * lines, chip power control, etc...).
+2 −1
Original line number Diff line number Diff line
@@ -187,7 +187,8 @@ struct mce_error_info {
#define MCE_EVENT_DONTRELEASE	false

extern void save_mce_event(struct pt_regs *regs, long handled,
			   struct mce_error_info *mce_err, uint64_t addr);
			   struct mce_error_info *mce_err, uint64_t nip,
			   uint64_t addr);
extern int get_mce_event(struct machine_check_event *mce, bool release);
extern void release_mce_event(void);
extern void machine_check_queue_event(void);
+3 −0
Original line number Diff line number Diff line
@@ -833,6 +833,8 @@ int64_t opal_sync_host_reboot(void);

/* Internal functions */
extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
extern int early_init_dt_scan_recoverable_ranges(unsigned long node,
				 const char *uname, int depth, void *data);

extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
@@ -863,6 +865,7 @@ extern void opal_nvram_init(void);
extern void opal_flash_init(void);

extern int opal_machine_check(struct pt_regs *regs);
extern bool opal_mce_check_early_recovery(struct pt_regs *regs);

extern void opal_shutdown(void);

+2 −2
Original line number Diff line number Diff line
@@ -70,7 +70,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
 */
void save_mce_event(struct pt_regs *regs, long handled,
		    struct mce_error_info *mce_err,
		    uint64_t addr)
		    uint64_t nip, uint64_t addr)
{
	uint64_t srr1;
	int index = __get_cpu_var(mce_nest_count)++;
@@ -86,7 +86,7 @@ void save_mce_event(struct pt_regs *regs, long handled,

	/* Populate generic machine check info */
	mce->version = MCE_V1;
	mce->srr0 = regs->nip;
	mce->srr0 = nip;
	mce->srr1 = regs->msr;
	mce->gpr3 = regs->gpr[3];
	mce->in_use = 1;
+33 −4
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#include <linux/ptrace.h>
#include <asm/mmu.h>
#include <asm/mce.h>
#include <asm/machdep.h>

/* flush SLBs and reload */
static void flush_and_reload_slb(void)
@@ -197,13 +198,32 @@ static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
	}
}

static long mce_handle_ue_error(struct pt_regs *regs)
{
	long handled = 0;

	/*
	 * On specific SCOM read via MMIO we may get a machine check
	 * exception with SRR0 pointing inside opal. If that is the
	 * case OPAL may have recovery address to re-read SCOM data in
	 * different way and hence we can recover from this MC.
	 */

	if (ppc_md.mce_check_early_recovery) {
		if (ppc_md.mce_check_early_recovery(regs))
			handled = 1;
	}
	return handled;
}

long __machine_check_early_realmode_p7(struct pt_regs *regs)
{
	uint64_t srr1, addr;
	uint64_t srr1, nip, addr;
	long handled = 1;
	struct mce_error_info mce_error_info = { 0 };

	srr1 = regs->msr;
	nip = regs->nip;

	/*
	 * Handle memory errors depending whether this was a load/store or
@@ -221,7 +241,11 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs)
		addr = regs->nip;
	}

	save_mce_event(regs, handled, &mce_error_info, addr);
	/* Handle UE error. */
	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
		handled = mce_handle_ue_error(regs);

	save_mce_event(regs, handled, &mce_error_info, nip, addr);
	return handled;
}

@@ -263,11 +287,12 @@ static long mce_handle_derror_p8(uint64_t dsisr)

long __machine_check_early_realmode_p8(struct pt_regs *regs)
{
	uint64_t srr1, addr;
	uint64_t srr1, nip, addr;
	long handled = 1;
	struct mce_error_info mce_error_info = { 0 };

	srr1 = regs->msr;
	nip = regs->nip;

	if (P7_SRR1_MC_LOADSTORE(srr1)) {
		handled = mce_handle_derror_p8(regs->dsisr);
@@ -279,6 +304,10 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs)
		addr = regs->nip;
	}

	save_mce_event(regs, handled, &mce_error_info, addr);
	/* Handle UE error. */
	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
		handled = mce_handle_ue_error(regs);

	save_mce_event(regs, handled, &mce_error_info, nip, addr);
	return handled;
}
Loading