Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a43c1590 authored by Mahesh Salgaonkar, committed by Michael Ellerman
Browse files

powerpc/pseries: Flush SLB contents on SLB MCE errors.



On pseries, as of today the system crashes if we get a machine check
exception due to SLB errors. These are soft errors and can be fixed
by flushing the SLBs, so the kernel can continue to function instead of
crashing. We do this in real mode before turning on the MMU; otherwise
we would run into nested machine checks. This patch now fetches the
RTAS error log in real mode and flushes the SLBs on SLB/ERAT errors.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent 04fce21c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -108,6 +108,7 @@ struct machdep_calls {

	/* Early exception handlers called in realmode */
	int		(*hmi_exception_early)(struct pt_regs *regs);
	long		(*machine_check_early)(struct pt_regs *regs);

	/* Called during machine check exception to retrieve fixup address. */
	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
+3 −0
Original line number Diff line number Diff line
@@ -210,4 +210,7 @@ extern void release_mce_event(void);
extern void machine_check_queue_event(void);
extern void machine_check_print_event_info(struct machine_check_event *evt,
					   bool user_mode);
#ifdef CONFIG_PPC_BOOK3S_64
void flush_and_reload_slb(void);
#endif /* CONFIG_PPC_BOOK3S_64 */
#endif /* __ASM_PPC64_MCE_H__ */
+129 −0
Original line number Diff line number Diff line
@@ -331,6 +331,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
machine_check_fwnmi:
	SET_SCRATCH0(r13)		/* save r13 */
	EXCEPTION_PROLOG_0(PACA_EXMC)
BEGIN_FTR_SECTION
	b	machine_check_pSeries_early
END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
machine_check_pSeries_0:
	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
	/*
@@ -342,6 +345,103 @@ machine_check_pSeries_0:

TRAMP_KVM_SKIP(PACA_EXMC, 0x200)

/*
 * Early machine check entry, branched to from machine_check_fwnmi. The whole
 * body sits in a feature section closed by
 * END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE).
 *
 * Flow:
 *  - pick a stack: the machine check emergency stack on first entry, the
 *    current r1 when paca->in_mce is already non-zero (nested entry);
 *  - bump paca->in_mce and bail out to the unrecoverable path if it exceeds
 *    MAX_MCE_DEPTH;
 *  - save the interrupted state into a frame and call machine_check_early()
 *    with r3 pointing at that frame;
 *  - then take one of three exits:
 *      2: saved MSR has MSR_PR set  -> restore everything and re-enter the
 *         regular machine_check_pSeries_0 path;
 *      1: MSR_RI clear, or machine_check_early() returned 0, or nesting
 *         limit hit -> RFI to unrecover_mce with MSR_ME cleared;
 *      otherwise -> RFI to mce_return, staying on the emergency stack.
 */
TRAMP_REAL_BEGIN(machine_check_pSeries_early)
BEGIN_FTR_SECTION
	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
	mr	r10,r1			/* Save r1 */
	lhz	r11,PACA_IN_MCE(r13)
	cmpwi	r11,0			/* Are we in nested machine check */
	bne	0f			/* Yes, we are. */
	/* First machine check entry */
	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
0:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
	addi	r11,r11,1		/* increment paca->in_mce */
	sth	r11,PACA_IN_MCE(r13)
	/* Limit nested MCE to level 4 to avoid stack overflow */
	cmpwi	r11,MAX_MCE_DEPTH
	/*
	 * NOTE(review): this branch is taken before the
	 * EXCEPTION_PROLOG_COMMON_* macros below have run, so the frame at r1
	 * has not been filled in on this path -- confirm unrecover_mce
	 * tolerates that.
	 */
	bgt	1f			/* Check if we hit limit of 4 */
	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
	mfspr	r12,SPRN_SRR1		/* Save SRR1 */
	EXCEPTION_PROLOG_COMMON_1()
	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
	EXCEPTION_PROLOG_COMMON_3(0x200)
	/* r3 = r1 + STACK_FRAME_OVERHEAD: argument for machine_check_early() */
	addi	r3,r1,STACK_FRAME_OVERHEAD
	BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
	/* r3 is left untouched from here until the cmpdi r3,0 check below. */
	ld	r12,_MSR(r1)
	andi.	r11,r12,MSR_PR		/* See if coming from user. */
	bne	2f			/* continue in V mode if we are. */

	/*
	 * At this point we are not sure about what context we come from.
	 * We may be in the middle of switching stack. r1 may not be valid.
	 * Hence stay on emergency stack, call machine_check_exception and
	 * return from the interrupt.
	 * But before that, check if this is an un-recoverable exception.
	 * If yes, then stay on emergency stack and panic.
	 */
	andi.	r11,r12,MSR_RI
	beq	1f

	/*
	 * Check if we have successfully handled/recovered from error, if not
	 * then stay on emergency stack and panic.
	 */
	cmpdi	r3,0		/* see if we handled MCE successfully */
	beq	1f		/* if !handled then panic */

	/* Stay on emergency stack and return from interrupt. */
	/* RFI target is mce_return, entered with the MSR value from PACAKMSR. */
	LOAD_HANDLER(r10,mce_return)
	mtspr	SPRN_SRR0,r10
	ld	r10,PACAKMSR(r13)
	mtspr	SPRN_SRR1,r10
	RFI_TO_KERNEL
	b	.

	/*
	 * Unrecoverable exit: reached when the nesting limit is exceeded, when
	 * MSR_RI is clear in the saved MSR, or when machine_check_early()
	 * returned 0. RFI target is unrecover_mce.
	 */
1:	LOAD_HANDLER(r10,unrecover_mce)
	mtspr	SPRN_SRR0,r10
	ld	r10,PACAKMSR(r13)
	/*
	 * We are going down. But there are chances that we might get hit by
	 * another MCE during panic path and we may run into unstable state
	 * with no way out. Hence, turn ME bit off while going down, so that
	 * when another MCE is hit during panic path, hypervisor will
	 * power cycle the lpar, instead of getting into MCE loop.
	 */
	li	r3,MSR_ME
	andc	r10,r10,r3		/* Turn off MSR_ME */
	mtspr	SPRN_SRR1,r10
	RFI_TO_KERNEL
	b	.

	/*
	 * User-mode exit (saved MSR has MSR_PR set): undo the early entry by
	 * restoring SRR0/SRR1, CTR, XER, LR, CR and the GPRs from the frame,
	 * dropping the nesting count and putting back the original r1, then
	 * redo the prolog and continue at machine_check_pSeries_0.
	 */
	/* Move original SRR0 and SRR1 into the respective regs */
2:	ld	r9,_MSR(r1)
	mtspr	SPRN_SRR1,r9
	ld	r3,_NIP(r1)
	mtspr	SPRN_SRR0,r3
	ld	r9,_CTR(r1)
	mtctr	r9
	ld	r9,_XER(r1)
	mtxer	r9
	ld	r9,_LINK(r1)
	mtlr	r9
	REST_GPR(0, r1)
	REST_8GPRS(2, r1)
	REST_GPR(10, r1)
	/* r11 and r12 are still needed as scratch, so they are restored last. */
	ld	r11,_CCR(r1)
	mtcr	r11
	/* Decrement paca->in_mce. */
	lhz	r12,PACA_IN_MCE(r13)
	subi	r12,r12,1
	sth	r12,PACA_IN_MCE(r13)
	REST_GPR(11, r1)
	REST_2GPRS(12, r1)
	/* restore original r1. */
	ld	r1,GPR1(r1)
	/* Same two-step prolog as at machine_check_fwnmi before branching on. */
	SET_SCRATCH0(r13)		/* save r13 */
	EXCEPTION_PROLOG_0(PACA_EXMC)
	b	machine_check_pSeries_0
END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)

EXC_COMMON_BEGIN(machine_check_common)
	/*
	 * Machine check is different because we use a different
@@ -535,6 +635,35 @@ EXC_COMMON_BEGIN(unrecover_mce)
	bl	unrecoverable_exception
	b	1b

/*
 * Return path for a machine check that was handled early.
 *
 * machine_check_pSeries_early gets here through RFI_TO_KERNEL, with SRR0 set
 * to mce_return and SRR1 loaded from PACAKMSR. r1 still points at the frame
 * that the early handler filled in.
 *
 * Calls machine_check_exception() on that frame, then restores
 * SRR0/SRR1, CTR, XER, LR, CR and the GPRs from it, decrements paca->in_mce,
 * reloads the original r1 and returns with RFI_TO_KERNEL.
 */
EXC_COMMON_BEGIN(mce_return)
	/* Invoke machine_check_exception to print MCE event and return. */
	addi	r3,r1,STACK_FRAME_OVERHEAD
	bl	machine_check_exception
	/* Reload the interrupted MSR/NIP into SRR1/SRR0 for the final RFI. */
	ld	r9,_MSR(r1)
	mtspr	SPRN_SRR1,r9
	ld	r3,_NIP(r1)
	mtspr	SPRN_SRR0,r3
	ld	r9,_CTR(r1)
	mtctr	r9
	ld	r9,_XER(r1)
	mtxer	r9
	ld	r9,_LINK(r1)
	mtlr	r9
	REST_GPR(0, r1)
	REST_8GPRS(2, r1)
	REST_GPR(10, r1)
	/* r11 and r12 are still needed as scratch, so they are restored last. */
	ld	r11,_CCR(r1)
	mtcr	r11
	/* Decrement paca->in_mce. */
	lhz	r12,PACA_IN_MCE(r13)
	subi	r12,r12,1
	sth	r12,PACA_IN_MCE(r13)
	REST_GPR(11, r1)
	REST_2GPRS(12, r1)
	/* restore original r1. */
	ld	r1,GPR1(r1)
	RFI_TO_KERNEL
	b	.

EXC_REAL(data_access, 0x300, 0x80)
EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
+5 −4
Original line number Diff line number Diff line
@@ -488,10 +488,11 @@ long machine_check_early(struct pt_regs *regs)
{
	long handled = 0;

	__this_cpu_inc(irq_stat.mce_exceptions);

	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
		handled = cur_cpu_spec->machine_check_early(regs);
	/*
	 * See if platform is capable of handling machine check.
	 */
	if (ppc_md.machine_check_early)
		handled = ppc_md.machine_check_early(regs);
	return handled;
}

+1 −1
Original line number Diff line number Diff line
@@ -60,7 +60,7 @@ static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)

/* flush SLBs and reload */
#ifdef CONFIG_PPC_BOOK3S_64
static void flush_and_reload_slb(void)
void flush_and_reload_slb(void)
{
	/* Invalidate all SLBs */
	slb_flush_all_realmode();
Loading