Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5aa90a84 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page
     tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how
     the ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and
     the user space visible page tables

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
parents 61233580 9f5cb6b3
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -2708,6 +2708,8 @@
			steal time is computed, but won't influence scheduler
			behaviour

	nopti		[X86-64] Disable kernel page table isolation

	nolapic		[X86-32,APIC] Do not enable or use the local APIC.

	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
	pt.		[PARIDE]
			See Documentation/blockdev/paride.txt.

	pti=		[X86_64]
			Control user/kernel address space isolation:
			on - enable
			off - disable
			auto - default setting

	pty.legacy_count=
			[KNL] Number of legacy pty's. Overwrites compiled-in
			default number.
+3 −2
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff91ffffffffffff (=49 bits) hole
ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
+3 −0
Original line number Diff line number Diff line
@@ -23,6 +23,9 @@
 */
#undef CONFIG_AMD_MEM_ENCRYPT

/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION

#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
+145 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>

/*

@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm

#ifdef CONFIG_PAGE_TABLE_ISOLATION

/*
 * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
 * halves:
 */
#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))

.macro SET_NOFLUSH_BIT	reg:req
	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm

.macro ADJUST_KERNEL_CR3 reg:req
	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
	andq    $(~PTI_SWITCH_MASK), \reg
.endm

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	mov	%cr3, \scratch_reg
	ADJUST_KERNEL_CR3 \scratch_reg
	mov	\scratch_reg, %cr3
.Lend_\@:
.endm

#define THIS_CPU_user_pcid_flush_mask   \
	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	mov	%cr3, \scratch_reg

	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

	/*
	 * Test if the ASID needs a flush.
	 */
	movq	\scratch_reg, \scratch_reg2
	andq	$(0x7FF), \scratch_reg		/* mask ASID */
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lnoflush_\@

	/* Flush needed, clear the bit */
	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	movq	\scratch_reg2, \scratch_reg
	jmp	.Lwrcr3_\@

.Lnoflush_\@:
	movq	\scratch_reg2, \scratch_reg
	SET_NOFLUSH_BIT \scratch_reg

.Lwrcr3_\@:
	/* Flip the PGD and ASID to the user version */
	orq     $(PTI_SWITCH_MASK), \scratch_reg
	mov	\scratch_reg, %cr3
.Lend_\@:
.endm

.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
	pushq	%rax
	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
	popq	%rax
.endm

.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
	movq	%cr3, \scratch_reg
	movq	\scratch_reg, \save_reg
	/*
	 * Is the "switch mask" all zero?  That means that both of
	 * these are zero:
	 *
	 *	1. The user/kernel PCID bit, and
	 *	2. The user/kernel "bit" that points CR3 to the
	 *	   bottom half of the 8k PGD
	 *
	 * That indicates a kernel CR3 value, not a user CR3.
	 */
	testq	$(PTI_SWITCH_MASK), \scratch_reg
	jz	.Ldone_\@

	ADJUST_KERNEL_CR3 \scratch_reg
	movq	\scratch_reg, %cr3

.Ldone_\@:
.endm

.macro RESTORE_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

	/*
	 * KERNEL pages can always resume with NOFLUSH as we do
	 * explicit flushes.
	 */
	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
	jnc	.Lnoflush_\@

	/*
	 * Check if there's a pending flush for the user ASID we're
	 * about to set.
	 */
	movq	\save_reg, \scratch_reg
	andq	$(0x7FF), \scratch_reg
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lnoflush_\@

	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jmp	.Lwrcr3_\@

.Lnoflush_\@:
	SET_NOFLUSH_BIT \save_reg

.Lwrcr3_\@:
	/*
	 * The CR3 write could be avoided when not changing its value,
	 * but would require a CR3 read *and* a scratch register.
	 */
	movq	\save_reg, %cr3
.Lend_\@:
.endm

#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm

#endif

#endif /* CONFIG_X86_64 */

/*
+41 −7
Original line number Diff line number Diff line
@@ -23,7 +23,6 @@
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -40,6 +39,8 @@
#include <asm/frame.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
	/* Stash the user RSP. */
	movq	%rsp, RSP_SCRATCH

	/* Note: using %rsp as a scratch reg. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp

	/* Load the top of the task stack into RSP */
	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp

@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
	 */

	swapgs
	/*
	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
	 * is not required to switch CR3.
	 */
	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

@@ -403,6 +411,7 @@ syscall_return_via_sysret:
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
	 * We can do future final exit work right here.
	 */

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
	 */

	pushq	%rdi				/* Stash user RDI */
	SWAPGS
	SWAPGS					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
	popq	%rdi				/* Restore user RDI */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax
	SWAPGS

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	SWAPGS					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
	UNWIND_HINT_FUNC

	pushq	%rdi
	/* Need to switch before accessing the thread stack. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:	ret

1:
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

	ret
END(paranoid_entry)

/*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	.Lparanoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
	SWAPGS_UNSAFE_STACK
	jmp	.Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	/* We have user CR3.  Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
	/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	SWAPGS
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	jmp .Lerror_entry_done

.Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user gsbase.
	 * Switch to kernel gsbase:
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
	 */
	SWAPGS
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
/*
 * Runs on exception stack.  Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *	%r14: Used to save/restore the CR3 of the interrupted context
 *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
 */
ENTRY(nmi)
	UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)

	swapgs
	cld
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
	movq	$-1, %rsi
	call	do_nmi

	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
Loading