
Commit 243e2511 authored by Benjamin Herrenschmidt, committed by Michael Ellerman

powerpc/xive: Native exploitation of the XIVE interrupt controller



The XIVE interrupt controller is the new interrupt controller
found in POWER9. It supports advanced virtualization capabilities
among other things.

Currently we use a set of firmware calls that simulate the old
"XICS" interrupt controller, but this is fairly inefficient.

This adds the framework for using XIVE along with a native
backend which uses OPAL for configuration. Later, a backend
allowing use in a KVM or PowerVM guest will also be provided.

This disables some fast paths for interrupts in KVM when XIVE is
enabled, as these rely on the firmware emulation code which is no
longer available when XIVE is used natively by Linux.

A later patch will make KVM also directly exploit the XIVE, thus
recovering the lost performance (and more).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Fixup pr_xxx("XIVE:"...), don't split pr_xxx() strings,
 tweak Kconfig so XIVE_NATIVE selects XIVE and depends on POWERNV,
 fix build errors when SMP=n, fold in fixes from Ben:
   Don't call cpu_online() on an invalid CPU number
   Fix irq target selection returning out of bounds cpu#
   Extra sanity checks on cpu numbers
 ]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent a978e139
arch/powerpc/include/asm/xive-regs.h  +97 −0
/*
 * Copyright 2016,2017 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#ifndef _ASM_POWERPC_XIVE_REGS_H
#define _ASM_POWERPC_XIVE_REGS_H

/*
 * Thread Management (aka "TM") registers
 */

/* TM register offsets */
#define TM_QW0_USER		0x000 /* All rings */
#define TM_QW1_OS		0x010 /* Ring 0..2 */
#define TM_QW2_HV_POOL		0x020 /* Ring 0..1 */
#define TM_QW3_HV_PHYS		0x030 /* Ring 0..1 */

/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
#define TM_NSR			0x0  /*  +   +   -   +  */
#define TM_CPPR			0x1  /*  -   +   -   +  */
#define TM_IPB			0x2  /*  -   +   +   +  */
#define TM_LSMFB		0x3  /*  -   +   +   +  */
#define TM_ACK_CNT		0x4  /*  -   +   -   -  */
#define TM_INC			0x5  /*  -   +   -   +  */
#define TM_AGE			0x6  /*  -   +   -   +  */
#define TM_PIPR			0x7  /*  -   +   -   +  */

#define TM_WORD0		0x0
#define TM_WORD1		0x4

/*
 * QW word 2 contains the valid bit at the top and other fields
 * depending on the QW.
 */
#define TM_WORD2		0x8
#define   TM_QW0W2_VU		PPC_BIT32(0)
#define   TM_QW0W2_LOGIC_SERV	PPC_BITMASK32(1,31) // XX 2,31 ?
#define   TM_QW1W2_VO		PPC_BIT32(0)
#define   TM_QW1W2_OS_CAM	PPC_BITMASK32(8,31)
#define   TM_QW2W2_VP		PPC_BIT32(0)
#define   TM_QW2W2_POOL_CAM	PPC_BITMASK32(8,31)
#define   TM_QW3W2_VT		PPC_BIT32(0)
#define   TM_QW3W2_LP		PPC_BIT32(6)
#define   TM_QW3W2_LE		PPC_BIT32(7)
#define   TM_QW3W2_T		PPC_BIT32(31)

/*
 * In addition to normal loads to "peek" and writes (only when invalid)
 * using 4- and 8-byte accesses, the above registers support these
 * "special" byte operations:
 *
 *   - Byte load from QW0[NSR] - User level NSR (EBB)
 *   - Byte store to QW0[NSR] - User level NSR (EBB)
 *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
 *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
 *                                    otherwise VT||0000000
 *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
 *
 * Then we have all these "special" CI ops at these offsets that trigger
 * all sorts of side effects:
 */
#define TM_SPC_ACK_EBB		0x800	/* Load8 ack EBB to reg */
#define TM_SPC_ACK_OS_REG	0x810	/* Load16 ack OS irq to reg */
#define TM_SPC_PUSH_USR_CTX	0x808	/* Store32 Push/Validate user context */
#define TM_SPC_PULL_USR_CTX	0x808	/* Load32 Pull/Invalidate user context */
#define TM_SPC_SET_OS_PENDING	0x812	/* Store8 Set OS irq pending bit */
#define TM_SPC_PULL_OS_CTX	0x818	/* Load32/Load64 Pull/Invalidate OS context to reg */
#define TM_SPC_PULL_POOL_CTX	0x828	/* Load32/Load64 Pull/Invalidate Pool context to reg */
#define TM_SPC_ACK_HV_REG	0x830	/* Load16 ack HV irq to reg */
#define TM_SPC_PULL_USR_CTX_OL	0xc08	/* Store8 Pull/Inval usr ctx to odd line */
#define TM_SPC_ACK_OS_EL	0xc10	/* Store8 ack OS irq to even line */
#define TM_SPC_ACK_HV_POOL_EL	0xc20	/* Store8 ack HV evt pool to even line */
#define TM_SPC_ACK_HV_EL	0xc30	/* Store8 ack HV irq to even line */
/* XXX more... */

/* NSR fields for the various QW ack types */
#define TM_QW0_NSR_EB		PPC_BIT8(0)
#define TM_QW1_NSR_EO		PPC_BIT8(0)
#define TM_QW3_NSR_HE		PPC_BITMASK8(0,1)
#define  TM_QW3_NSR_HE_NONE	0
#define  TM_QW3_NSR_HE_POOL	1
#define  TM_QW3_NSR_HE_PHYS	2
#define  TM_QW3_NSR_HE_LSI	3
#define TM_QW3_NSR_I		PPC_BIT8(2)
#define TM_QW3_NSR_GRP_LVL	PPC_BITMASK8(3,7)

/* Utilities to manipulate these (originally from OPAL) */
#define MASK_TO_LSH(m)		(__builtin_ffsl(m) - 1)
#define GETFIELD(m, v)		(((v) & (m)) >> MASK_TO_LSH(m))
#define SETFIELD(m, v, val)				\
	(((v) & ~(m)) |	((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))

#endif /* _ASM_POWERPC_XIVE_REGS_H */
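
For illustration, here is a hypothetical use of these field helpers on a QW1 word 2 value. Only the masks and the GETFIELD/SETFIELD macros come from the header above; the function name, variables, and the pr_debug call are made up for the example:

#include <linux/printk.h>
#include <linux/types.h>
#include <asm/xive-regs.h>

/*
 * Replace the CAM line in an OS ring (QW1) word 2 and make sure the
 * valid bit stays set. MASK_TO_LSH derives the shift from the lowest
 * set bit of the mask, so GETFIELD/SETFIELD work on any field above.
 */
static u32 xive_example_set_os_cam(u32 w2, u32 cam)
{
	u32 old = GETFIELD(TM_QW1W2_OS_CAM, w2);

	pr_debug("XIVE: replacing OS CAM 0x%x with 0x%x\n", old, cam);

	return SETFIELD(TM_QW1W2_OS_CAM, w2, cam) | TM_QW1W2_VO;
}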
arch/powerpc/include/asm/xive.h  +163 −0
/*
 * Copyright 2016,2017 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#ifndef _ASM_POWERPC_XIVE_H
#define _ASM_POWERPC_XIVE_H

#define XIVE_INVALID_VP	0xffffffff

#ifdef CONFIG_PPC_XIVE

/*
 * Thread Interrupt Management Area (TIMA)
 *
 * This is a global MMIO region divided into 4 pages of varying access
 * permissions, providing access to per-cpu interrupt management
 * functions. It always identifies the CPU doing the access based
 * on the PowerBus initiator ID, so we always access it via the
 * same offset regardless of where the code is executing.
 */
extern void __iomem *xive_tima;

/*
 * Offset in the TM area of our current execution level (provided by
 * the backend)
 */
extern u32 xive_tima_offset;

/*
 * Per-irq data (irq_get_handler_data() for normal IRQs); IPIs
 * have it stored in the xive_cpu structure. For normal interrupts
 * we also cache the current target CPU.
 *
 * This structure is set up by the backend for each interrupt.
 */
struct xive_irq_data {
	u64 flags;
	u64 eoi_page;
	void __iomem *eoi_mmio;
	u64 trig_page;
	void __iomem *trig_mmio;
	u32 esb_shift;
	int src_chip;

	/* Setup/used by frontend */
	int target;
	bool saved_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI	0x01
#define XIVE_IRQ_FLAG_LSI	0x02
#define XIVE_IRQ_FLAG_SHIFT_BUG	0x04
#define XIVE_IRQ_FLAG_MASK_FW	0x08
#define XIVE_IRQ_FLAG_EOI_FW	0x10

#define XIVE_INVALID_CHIP_ID	-1

/* A queue tracking structure in a CPU */
struct xive_q {
	__be32 			*qpage;
	u32			msk;
	u32			idx;
	u32			toggle;
	u64			eoi_phys;
	u32			esc_irq;
	atomic_t		count;
	atomic_t		pending_count;
};

/*
 * "magic" Event State Buffer (ESB) MMIO offsets.
 *
 * Each interrupt source has a 2-bit state machine called ESB
 * which can be controlled by MMIO. It's made of 2 bits, P and
 * Q. P indicates that an interrupt is pending (has been sent
 * to a queue and is waiting for an EOI). Q indicates that the
 * interrupt has been triggered while pending.
 *
 * This acts as a coalescing mechanism in order to guarantee
 * that a given interrupt only occurs at most once in a queue.
 *
 * When doing an EOI, the Q bit will indicate if the interrupt
 * needs to be re-triggered.
 *
 * The following offsets into the ESB MMIO allow reading or
 * manipulating the PQ bits. They must be used with an 8-byte
 * load instruction. They all return the previous state of the
 * interrupt (atomically).
 *
 * Additionally, some ESB pages support doing an EOI via a
 * store at 0 and some ESBs support doing a trigger via a
 * separate trigger page.
 */
#define XIVE_ESB_GET		0x800
#define XIVE_ESB_SET_PQ_00	0xc00
#define XIVE_ESB_SET_PQ_01	0xd00
#define XIVE_ESB_SET_PQ_10	0xe00
#define XIVE_ESB_SET_PQ_11	0xf00
#define XIVE_ESB_MASK		XIVE_ESB_SET_PQ_01

#define XIVE_ESB_VAL_P		0x2
#define XIVE_ESB_VAL_Q		0x1

/* Global enable flags for the XIVE support */
extern bool __xive_enabled;

static inline bool xive_enabled(void) { return __xive_enabled; }

extern bool xive_native_init(void);
extern void xive_smp_probe(void);
extern int  xive_smp_prepare_cpu(unsigned int cpu);
extern void xive_smp_setup_cpu(void);
extern void xive_smp_disable_cpu(void);
extern void xive_kexec_teardown_cpu(int secondary);
extern void xive_shutdown(void);
extern void xive_flush_interrupt(void);

/* xmon hook */
extern void xmon_xive_do_dump(int cpu);

/* APIs used by KVM */
extern u32 xive_native_default_eq_shift(void);
extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
extern void xive_native_free_vp_block(u32 vp_base);
extern int xive_native_populate_irq_data(u32 hw_irq,
					 struct xive_irq_data *data);
extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
extern u32 xive_native_alloc_irq(void);
extern void xive_native_free_irq(u32 irq);
extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);

extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
				       __be32 *qpage, u32 order, bool can_escalate);
extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);

extern bool __xive_irq_trigger(struct xive_irq_data *xd);
extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);

extern bool is_xive_irq(struct irq_chip *chip);

#else

static inline bool xive_enabled(void) { return false; }

static inline bool xive_native_init(void) { return false; }
static inline void xive_smp_probe(void) { }
static inline int  xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
static inline void xive_smp_setup_cpu(void) { }
static inline void xive_smp_disable_cpu(void) { }
static inline void xive_kexec_teardown_cpu(int secondary) { }
static inline void xive_shutdown(void) { }
static inline void xive_flush_interrupt(void) { }

static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
static inline void xive_native_free_vp_block(u32 vp_base) { }

#endif

#endif /* _ASM_POWERPC_XIVE_H */
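
Neither the ESB accessors nor the queue-consumption logic appear in this excerpt (they live in the common XIVE code added by this patch), so the following is only a sketch of how the definitions above fit together. It assumes the backend has already mapped eoi_mmio, and that bit 31 of each queue entry is a generation bit that flips polarity each time the producer wraps; the xive_example_* names are made up:

#include <linux/printk.h>
#include <asm/io.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>

/*
 * Peek at a source's PQ bits, then mask it by setting PQ to 01.
 * Per the comment above, both are 8-byte loads that atomically
 * return the previous PQ state in the low two bits.
 */
static bool xive_example_mask_source(struct xive_irq_data *xd)
{
	u64 pq = in_be64(xd->eoi_mmio + XIVE_ESB_GET);

	pr_debug("XIVE: PQ=%c%c before masking\n",
		 pq & XIVE_ESB_VAL_P ? '1' : '0',
		 pq & XIVE_ESB_VAL_Q ? '1' : '0');

	pq = in_be64(xd->eoi_mmio + XIVE_ESB_MASK);	/* PQ <- 01 */

	/* Was an interrupt pending (sent to a queue, awaiting EOI)? */
	return pq & XIVE_ESB_VAL_P;
}

/*
 * Pop one entry from a queue. An entry is valid when its generation
 * bit differs from the toggle we expect for the current pass through
 * the ring; consuming wraps the index with msk and flips the toggle.
 */
static u32 xive_example_queue_read(struct xive_q *q)
{
	u32 cur;

	if (!q->qpage)
		return 0;
	cur = be32_to_cpu(q->qpage[q->idx]);
	if ((cur >> 31) == q->toggle)
		return 0;	/* queue is empty */

	q->idx = (q->idx + 1) & q->msk;
	if (q->idx == 0)
		q->toggle ^= 1;

	return cur & 0x7fffffff;	/* strip the generation bit */
}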
arch/powerpc/include/asm/xmon.h  +2 −0
@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
 extern int cpus_are_in_xmon(void);
 #endif
 
+extern void xmon_printf(const char *format, ...);
+
 #endif /* __KERNEL __ */
 #endif /* __ASM_POWERPC_XMON_H */
arch/powerpc/kvm/book3s_hv_builtin.c  +8 −0
@@ -23,6 +23,7 @@
 #include <asm/kvm_book3s.h>
 #include <asm/archrandom.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
 #include <asm/io.h>
@@ -224,6 +225,10 @@ void kvmhv_rm_send_ipi(int cpu)
 		return;
 	}
 
+	/* We should never reach this */
+	if (WARN_ON_ONCE(xive_enabled()))
+		return;
+
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
 	if (xics_phys)
@@ -386,6 +391,9 @@ long kvmppc_read_intr(void)
 	long rc;
 	bool again;
 
+	if (xive_enabled())
+		return 1;
+
 	do {
 		again = false;
 		rc = kvmppc_read_one_intr(&again);
arch/powerpc/platforms/powernv/Kconfig  +1 −0
@@ -4,6 +4,7 @@ config PPC_POWERNV
 	select PPC_NATIVE
 	select PPC_XICS
 	select PPC_ICP_NATIVE
+	select PPC_XIVE_NATIVE
 	select PPC_P7_NAP
 	select PCI
 	select PCI_MSI
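
The PPC_XIVE_NATIVE symbol selected here is introduced by a hunk not shown in this excerpt. Per the fixup note in the commit message ("XIVE_NATIVE selects XIVE and depends on POWERNV"), its stanza would have roughly this shape (a sketch, not the verbatim hunk):

config PPC_XIVE
	bool
	default n

config PPC_XIVE_NATIVE
	bool
	default n
	select PPC_XIVE
	depends on PPC_POWERNV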