Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8f0baadf authored by Dave Hansen's avatar Dave Hansen Committed by Greg Kroah-Hartman
Browse files

kaiser: merged update




Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging).

Signed-off-by: default avatarDave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: default avatarHugh Dickins <hughd@google.com>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 13be4483
Loading
Loading
Loading
Loading
+92 −13
Original line number Diff line number Diff line
@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath:
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64
@@ -326,11 +333,25 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel.  This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs.  Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret
@@ -1087,6 +1108,13 @@ ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	/*
	 * error_entry() always returns with a kernel gsbase and
	 * CR3.  We must also have a kernel CR3/gsbase before
	 * calling TRACE_IRQS_*.  Just unconditionally switch to
	 * the kernel CR3 here.
	 */
	SWITCH_KERNEL_CR3
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace
@@ -1096,7 +1124,6 @@ ENTRY(error_entry)
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
	/*
@@ -1148,7 +1175,6 @@ ENTRY(error_entry)
	 * Switch to kernel gsbase:
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1249,7 +1275,10 @@ ENTRY(nmi)
	 */

	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * percpu variables are mapped with user CR3, so no need
	 * to switch CR3 here.
	 */
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1283,14 +1312,33 @@ ENTRY(nmi)

	movq	%rsp, %rdi
	movq	$-1, %rsi
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	pushq	%rax
#ifdef CONFIG_KAISER_REAL_SWITCH
	andq	$(~0x1000), %rax
#endif
	movq	%rax, %cr3
#endif
	call	do_nmi
	/*
	 * Unconditionally restore CR3.  I know we return to
	 * kernel code that needs user CR3, but do we ever return
	 * to "user mode" where we need the kernel CR3?
	 */
#ifdef CONFIG_KAISER
	popq	%rax
	mov	%rax, %cr3
#endif

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.  Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 * work, because we don't want to enable interrupts.  Do not
	 * switch to user CR3: we might be going back to kernel code
	 * that had a user CR3 set.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret

@@ -1486,23 +1534,54 @@ end_repeat_nmi:
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 * Use the same approach as paranoid_entry to handle SWAPGS, but
	 * without CR3 handling since we do that differently in NMIs.  No
	 * need to use paranoid_exit as we should not be calling schedule
	 * in NMI context.  Even with normal interrupts enabled. An NMI
	 * should not be setting NEED_RESCHED or anything that normal
	 * interrupts and exceptions might do.
	 */
	call	paranoid_entry
	cld
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	pushq	%rax
#ifdef CONFIG_KAISER_REAL_SWITCH
	andq	$(~0x1000), %rax
#endif
	movq	%rax, %cr3
#endif

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	addq	$8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
	movq	$-1, %rsi
	call	do_nmi
	/*
	 * Unconditionally restore CR3.  We might be returning to
	 * kernel code that needs user CR3, like just before
	 * a sysret.
	 */
#ifdef CONFIG_KAISER
	popq	%rax
	mov	%rax, %cr3
#endif

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWITCH_USER_CR3_NO_STACK
	/* We fixed up CR3 above, so no need to switch it here */
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
+26 −17
Original line number Diff line number Diff line
@@ -16,13 +16,17 @@

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
#endif
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
#endif
movq \reg, %cr3
.endm

@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm

#endif /* CONFIG_KAISER */

#else /* __ASSEMBLY__ */


#ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that
// the address space has to be switched before the registers have been stored.
// To change the address space, another register is needed.
// A register therefore has to be stored/restored.
//
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
 * Upon kernel/user mode switch, it may happen that the address
 * space has to be switched before the registers have been
 * stored.  To change the address space, another register is
 * needed.  A register therefore has to be stored/restored.
*/

#endif /* CONFIG_KAISER */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/**
 *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
 *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
 *  @addr: the start address of the range
 *  @size: the size of the range
 *  @flags: The mapping flags of the pages
 *
 *  the mapping is done on a global scope, so no bigger synchronization has to be done.
 *  the pages have to be manually unmapped again when they are not needed any longer.
 *  The mapping is done on a global scope, so no bigger
 *  synchronization has to be done.  The pages have to be
 *  manually unmapped again when they are not needed any longer.
 */
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);


/**
 *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
 *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
 *  @addr: the start address of the range
 *  @size: the size of the range
 */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
 *  shadowmem_initialize_mapping - Initialize the shadow mapping
 *  kaiser_initialize_mapping - Initialize the shadow mapping
 *
 *  most parts of the shadow mapping can be mapped upon boot time.
 *  only the thread stacks have to be mapped on runtime.
 *  the mapped regions are not unmapped at all.
 *  Most parts of the shadow mapping can be mapped upon boot
 *  time.  Only per-process things like the thread stacks
 *  or a new LDT have to be mapped at runtime.  These boot-
 *  time mappings are permanent and never unmapped.
 */
extern void kaiser_init(void);

#endif
#endif /* CONFIG_KAISER */

#endif /* __ASSEMBLY */



+15 −3
Original line number Diff line number Diff line
@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)

static inline int pgd_bad(pgd_t pgd)
{
	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
	pgdval_t ignore_flags = _PAGE_USER;
	/*
	 * We set NX on KAISER pgds that map userspace memory so
	 * that userspace can not meaningfully use the kernel
	 * page table by accident; it will fault on the first
	 * instruction it tries to run.  See native_set_pgd().
	 */
	if (IS_ENABLED(CONFIG_KAISER))
		ignore_flags |= _PAGE_NX;

	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}

static inline int pgd_none(pgd_t pgd)
@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
       memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
	// clone the shadow pgd part as well
	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
	/* Clone the shadow pgd part as well */
	memcpy(native_get_shadow_pgd(dst),
	       native_get_shadow_pgd(src),
	       count * sizeof(pgd_t));
#endif
}

+40 −8
Original line number Diff line number Diff line
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
}

#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}

static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
	BUILD_BUG_ON(1);
	return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
	return pgdp;
}
#endif /* CONFIG_KAISER */

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(void *__ptr)
{
	unsigned long ptr = (unsigned long)__ptr;

	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
	// We know that a pgd is page aligned.
	// Therefore the lower indices have to be mapped to user space.
	// These pages are mapped to the shadow mapping.
	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
	pteval_t extra_kern_pgd_flags = 0;
	/* Do we need to also populate the shadow pgd? */
	if (is_userspace_pgd(pgdp)) {
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		/*
		 * Even if the entry is *mapping* userspace, ensure
		 * that userspace can not use it.  This way, if we
		 * get out to userspace running on the kernel CR3,
		 * userspace will crash instead of running.
		 */
		extra_kern_pgd_flags = _PAGE_NX;
	}

	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
	pgdp->pgd = pgd.pgd;
	pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
	*pgdp = pgd;
#endif
+1 −5
Original line number Diff line number Diff line
@@ -123,11 +123,7 @@
#define _PAGE_DEVMAP	(_AT(pteval_t, 0))
#endif

#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif

#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
			 _PAGE_ACCESSED | _PAGE_DIRTY)
Loading