Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8f0baadf authored by Dave Hansen's avatar Dave Hansen Committed by Greg Kroah-Hartman
Browse files

kaiser: merged update




Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging).

Signed-off-by: default avatarDave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: default avatarHugh Dickins <hughd@google.com>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 13be4483
Loading
Loading
Loading
Loading
+92 −13
Original line number Diff line number Diff line
@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath:
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64
@@ -326,11 +333,25 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret
@@ -1087,6 +1108,13 @@ ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	/*
	 * error_entry() always returns with a kernel gsbase and
	 * CR3.  We must also have a kernel CR3/gsbase before
	 * calling TRACE_IRQS_*.  Just unconditionally switch to
	 * the kernel CR3 here.
	 */
	SWITCH_KERNEL_CR3
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace
@@ -1096,7 +1124,6 @@ ENTRY(error_entry)
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
	/*
@@ -1148,7 +1175,6 @@ ENTRY(error_entry)
	 * Switch to kernel gsbase:
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1249,7 +1275,10 @@ ENTRY(nmi)
	 */

	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * percpu variables are mapped with user CR3, so no need
	 * to switch CR3 here.
	 */
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1283,14 +1312,33 @@ ENTRY(nmi)

	movq	%rsp, %rdi
	movq	$-1, %rsi
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	pushq	%rax
#ifdef CONFIG_KAISER_REAL_SWITCH
	andq	$(~0x1000), %rax
#endif
	movq	%rax, %cr3
#endif
	call	do_nmi
	/*
	 * Unconditionally restore CR3.  I know we return to
	 * kernel code that needs user CR3, but do we ever return
	 * to "user mode" where we need the kernel CR3?
	 */
#ifdef CONFIG_KAISER
	popq	%rax
	mov	%rax, %cr3
#endif

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.  Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 * work, because we don't want to enable interrupts.  Do not
	 * switch to user CR3: we might be going back to kernel code
	 * that had a user CR3 set.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret

@@ -1486,23 +1534,54 @@ end_repeat_nmi:
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 * Use the same approach as paranoid_entry to handle SWAPGS, but
	 * without CR3 handling since we do that differently in NMIs.  No
	 * need to use paranoid_exit as we should not be calling schedule
	 * in NMI context.  Even with normal interrupts enabled. An NMI
	 * should not be setting NEED_RESCHED or anything that normal
	 * interrupts and exceptions might do.
	 */
	call	paranoid_entry
	cld
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	pushq	%rax
#ifdef CONFIG_KAISER_REAL_SWITCH
	andq	$(~0x1000), %rax
#endif
	movq	%rax, %cr3
#endif

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	addq	$8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
	movq	$-1, %rsi
	call	do_nmi
	/*
	 * Unconditionally restore CR3.  We might be returning to
	 * kernel code that needs user CR3, like just before
	 * a sysret.
	 */
#ifdef CONFIG_KAISER
	popq	%rax
	mov	%rax, %cr3
#endif

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWITCH_USER_CR3_NO_STACK
	/* We fixed up CR3 above, so no need to switch it here */
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
+26 −17
Original line number Diff line number Diff line
@@ -16,13 +16,17 @@

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
#endif
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
#endif
movq \reg, %cr3
.endm

@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm

#endif /* CONFIG_KAISER */

#else /* __ASSEMBLY__ */


#ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that
// the address space has to be switched before the registers have been stored.
// To change the address space, another register is needed.
// A register therefore has to be stored/restored.
//
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
 * Upon kernel/user mode switch, it may happen that the address
 * space has to be switched before the registers have been
 * stored.  To change the address space, another register is
 * needed.  A register therefore has to be stored/restored.
*/

#endif /* CONFIG_KAISER */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/**
 *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
 *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
 *  @addr: the start address of the range
 *  @size: the size of the range
 *  @flags: The mapping flags of the pages
 *
 *  the mapping is done on a global scope, so no bigger synchronization has to be done.
 *  the pages have to be manually unmapped again when they are not needed any longer.
 *  The mapping is done on a global scope, so no bigger
 *  synchronization has to be done.  The pages have to be
 *  manually unmapped again when they are not needed any longer.
 */
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);


/**
 *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
 *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
 *  @addr: the start address of the range
 *  @size: the size of the range
 */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
 *  shadowmem_initialize_mapping - Initialize the shadow mapping
 *  kaiser_initialize_mapping - Initialize the shadow mapping
 *
 *  most parts of the shadow mapping can be mapped upon boot time.
 *  only the thread stacks have to be mapped on runtime.
 *  the mapped regions are not unmapped at all.
 *  Most parts of the shadow mapping can be mapped upon boot
 *  time.  Only per-process things like the thread stacks
 *  or a new LDT have to be mapped at runtime.  These boot-
 *  time mappings are permanent and never unmapped.
 */
extern void kaiser_init(void);

#endif
#endif /* CONFIG_KAISER */

#endif /* __ASSEMBLY */



+15 −3
Original line number Diff line number Diff line
@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)

static inline int pgd_bad(pgd_t pgd)
{
	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
	pgdval_t ignore_flags = _PAGE_USER;
	/*
	 * We set NX on KAISER pgds that map userspace memory so
	 * that userspace can not meaningfully use the kernel
	 * page table by accident; it will fault on the first
	 * instruction it tries to run.  See native_set_pgd().
	 */
	if (IS_ENABLED(CONFIG_KAISER))
		ignore_flags |= _PAGE_NX;

	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}

static inline int pgd_none(pgd_t pgd)
@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
       memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
	// clone the shadow pgd part as well
	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
	/* Clone the shadow pgd part as well */
	memcpy(native_get_shadow_pgd(dst),
	       native_get_shadow_pgd(src),
	       count * sizeof(pgd_t));
#endif
}

+40 −8
Original line number Diff line number Diff line
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
}

#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}

static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
	BUILD_BUG_ON(1);
	return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
	return pgdp;
}
#endif /* CONFIG_KAISER */

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(void *__ptr)
{
	unsigned long ptr = (unsigned long)__ptr;

	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
	// We know that a pgd is page aligned.
	// Therefore the lower indices have to be mapped to user space.
	// These pages are mapped to the shadow mapping.
	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
	pteval_t extra_kern_pgd_flags = 0;
	/* Do we need to also populate the shadow pgd? */
	if (is_userspace_pgd(pgdp)) {
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		/*
		 * Even if the entry is *mapping* userspace, ensure
		 * that userspace can not use it.  This way, if we
		 * get out to userspace running on the kernel CR3,
		 * userspace will crash instead of running.
		 */
		extra_kern_pgd_flags = _PAGE_NX;
	}

	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
	pgdp->pgd = pgd.pgd;
	pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
	*pgdp = pgd;
#endif
+1 −5
Original line number Diff line number Diff line
@@ -123,11 +123,7 @@
#define _PAGE_DEVMAP	(_AT(pteval_t, 0))
#endif

#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif

#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
			 _PAGE_ACCESSED | _PAGE_DIRTY)
Loading