Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 13be4483 authored by Richard Fellner, committed by Greg Kroah-Hartman
Browse files

KAISER: Kernel Address Isolation


This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.

More information about the patch can be found on:

        https://github.com/IAIK/KAISER

From: Richard Fellner <richard.fellner@student.tugraz.at>
From: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
Date: Thu, 4 May 2017 14:26:50 +0200
Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5

To: <linux-kernel@vger.kernel.org>
To: <kernel-hardening@lists.openwall.com>
Cc: <clementine.maurice@iaik.tugraz.at>
Cc: <moritz.lipp@iaik.tugraz.at>
Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Cc: Richard Fellner <richard.fellner@student.tugraz.at>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: <kirill.shutemov@linux.intel.com>
Cc: <anders.fogh@gdata-adan.de>

After several recent works [1,2,3] KASLR on x86_64 was basically
considered dead by many researchers. We have been working on an
efficient but effective fix for this problem and found that not mapping
the kernel space when running in user mode is the solution to this
problem [4] (the corresponding paper [5] will be presented at ESSoS17).

With this RFC patch we allow anybody to configure their kernel with the
flag CONFIG_KAISER to add our defense mechanism.

If there are any questions we would love to answer them.
We also appreciate any comments!

Cheers,
Daniel (+ the KAISER team from Graz University of Technology)

[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
[2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
[3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
[4] https://github.com/IAIK/KAISER
[5] https://gruss.cc/files/kaiser.pdf

[patch based also on
https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch]

Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent b5fd58e9
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/kaiser.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
@@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath:
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

@@ -323,10 +326,12 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -424,6 +429,7 @@ ENTRY(ret_from_fork)
	movq	%rsp, %rdi
	call	syscall_return_slowpath	/* returns with IRQs disabled */
	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_regs_and_iret

@@ -478,6 +484,7 @@ END(irq_entries_start)
	 * tracking that we're in kernel mode.
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

	/*
	 * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -535,6 +542,7 @@ GLOBAL(retint_user)
	mov	%rsp,%rdi
	call	prepare_exit_to_usermode
	TRACE_IRQS_IRETQ
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_regs_and_iret

@@ -612,6 +620,7 @@ native_irq_return_ldt:

	pushq	%rdi				/* Stash user RDI */
	SWAPGS
	SWITCH_KERNEL_CR3
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -638,6 +647,7 @@ native_irq_return_ldt:
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax
	SWITCH_USER_CR3
	SWAPGS
	movq	%rax, %rsp

@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	SWITCH_KERNEL_CR3
	xorl	%ebx, %ebx
1:	ret
END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWITCH_USER_CR3_NO_STACK
	SWAPGS_UNSAFE_STACK
	jmp	paranoid_exit_restore
paranoid_exit_no_swapgs:
@@ -1084,6 +1096,7 @@ ENTRY(error_entry)
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
	/*
@@ -1135,6 +1148,7 @@ ENTRY(error_entry)
	 * Switch to kernel gsbase:
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1235,6 +1249,7 @@ ENTRY(nmi)
	 */

	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1275,6 +1290,7 @@ ENTRY(nmi)
	 * work, because we don't want to enable interrupts.  Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret

@@ -1486,6 +1502,7 @@ end_repeat_nmi:
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWITCH_USER_CR3_NO_STACK
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
+6 −1
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>

@@ -48,6 +49,7 @@
ENTRY(entry_SYSENTER_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK

	/* Stash user ESP and switch to the kernel stack. */
	movl	%esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
	SWITCH_USER_CR3
	movq	RSP-ORIG_RAX(%rsp), %rsp
	swapgs
	sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	ASM_CLAC			/* Do this early to minimize exposure */
	SWAPGS

	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)

	/* Go back to user mode. */
	TRACE_IRQS_ON
	SWITCH_USER_CR3_NO_STACK
	SWAPGS
	jmp	restore_regs_and_iret
END(entry_INT80_compat)
+1 −1
Original line number Diff line number Diff line
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED	((void *)~0UL)

typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);

#endif /* !ASSEMBLY_ */

+113 −0
Original line number Diff line number Diff line
#ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H

/* This file includes the definitions for the KAISER feature.
 * KAISER is a countermeasure against x86_64 side channel attacks on the kernel virtual memory.
 * It has a shadow-pgd for every process. The shadow-pgd has a minimalistic kernel-set mapped,
 * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
 * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
 * By this, the virtual memory caches are flushed, and the user may not attack the whole kernel memory.
 *
 * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, such as the entry/exit
 * functions of the user space, or the stacks.
 */
#ifdef __ASSEMBLY__
#ifdef CONFIG_KAISER

/*
 * _SWITCH_TO_KERNEL_CR3 - rewrite CR3 with bit 12 (0x1000) cleared.
 * \reg: general-purpose scratch register; on exit it holds the value that
 *       was written to CR3.
 *
 * Nothing is saved or restored here: \reg is clobbered, and the andq also
 * modifies RFLAGS. Callers that need \reg preserved use one of the
 * SWITCH_KERNEL_CR3* wrappers.
 *
 * NOTE(review): 0x1000 is presumably the byte offset between a kernel pgd
 * and its shadow pgd (cf. native_get_shadow_pgd()) -- confirm, and consider
 * giving the constant a name.
 */
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg			/* \reg = current CR3 */
andq $(~0x1000), \reg		/* clear bit 12 */
movq \reg, %cr3			/* install the modified value */
.endm

/*
 * _SWITCH_TO_USER_CR3 - rewrite CR3 with bit 12 (0x1000) set.
 * \reg: general-purpose scratch register; on exit it holds the value that
 *       was written to CR3.
 *
 * Nothing is saved or restored here: \reg is clobbered, and the orq also
 * modifies RFLAGS. Callers that need \reg preserved use one of the
 * SWITCH_USER_CR3* wrappers.
 *
 * NOTE(review): 0x1000 is presumably the byte offset between a kernel pgd
 * and its shadow pgd (cf. native_get_shadow_pgd()) -- confirm, and consider
 * giving the constant a name.
 */
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg			/* \reg = current CR3 */
orq $(0x1000), \reg		/* set bit 12 */
movq \reg, %cr3			/* install the modified value */
.endm

/*
 * SWITCH_KERNEL_CR3 - clear bit 12 of CR3, preserving %rax on the stack.
 *
 * Wraps _SWITCH_TO_KERNEL_CR3 with %rax as the scratch register. %rax is
 * pushed before CR3 is rewritten and popped afterwards, so the macro uses
 * 8 bytes of the current stack and %rsp must point to memory that is
 * accessible both before and after the CR3 write. RFLAGS is modified by
 * the inner andq; all general-purpose registers are unchanged on exit.
 */
.macro SWITCH_KERNEL_CR3
pushq %rax			/* free up a scratch register */
_SWITCH_TO_KERNEL_CR3 %rax
popq %rax			/* runs after the CR3 write */
.endm

/*
 * SWITCH_USER_CR3 - set bit 12 of CR3, preserving %rax on the stack.
 *
 * Wraps _SWITCH_TO_USER_CR3 with %rax as the scratch register. %rax is
 * pushed before CR3 is rewritten and popped afterwards, so the macro uses
 * 8 bytes of the current stack and %rsp must point to memory that is
 * accessible both before and after the CR3 write. RFLAGS is modified by
 * the inner orq; all general-purpose registers are unchanged on exit.
 */
.macro SWITCH_USER_CR3
pushq %rax			/* free up a scratch register */
_SWITCH_TO_USER_CR3 %rax
popq %rax			/* runs after the CR3 write */
.endm

/*
 * SWITCH_KERNEL_CR3_NO_STACK - clear bit 12 of CR3 without touching the stack.
 *
 * Same effect as SWITCH_KERNEL_CR3, but %rax is parked in the per-CPU
 * variable unsafe_stack_register_backup instead of being pushed. The store
 * is done before CR3 is rewritten and the reload afterwards, so that
 * variable must be accessible under both CR3 values (it is declared with
 * DECLARE_PER_CPU_USER_MAPPED in this header). RFLAGS is modified by the
 * inner andq.
 *
 * The macro relies on PER_CPU_VAR() addressing being valid at the call
 * site; the entry paths in this patch invoke it right after
 * SWAPGS/SWAPGS_UNSAFE_STACK.
 *
 * NOTE(review): there is a single backup slot per CPU. If this macro (or
 * SWITCH_USER_CR3_NO_STACK) can be re-entered on the same CPU between the
 * store and the reload -- the NMI path also uses it -- the saved %rax would
 * be overwritten. Confirm that this cannot happen.
 */
.macro SWITCH_KERNEL_CR3_NO_STACK
movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
_SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm


/*
 * SWITCH_USER_CR3_NO_STACK - set bit 12 of CR3 without touching the stack.
 *
 * Same effect as SWITCH_USER_CR3, but %rax is parked in the per-CPU
 * variable unsafe_stack_register_backup instead of being pushed. The store
 * is done before CR3 is rewritten and the reload afterwards, so that
 * variable must be accessible under both CR3 values (it is declared with
 * DECLARE_PER_CPU_USER_MAPPED in this header). RFLAGS is modified by the
 * inner orq.
 *
 * The macro relies on PER_CPU_VAR() addressing being valid at the call
 * site; the exit paths in this patch invoke it immediately before
 * SWAPGS/SWAPGS_UNSAFE_STACK.
 *
 * NOTE(review): there is a single backup slot per CPU. If this macro (or
 * SWITCH_KERNEL_CR3_NO_STACK) can be re-entered on the same CPU between the
 * store and the reload -- the NMI path also uses it -- the saved %rax would
 * be overwritten. Confirm that this cannot happen.
 */
.macro SWITCH_USER_CR3_NO_STACK

movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
_SWITCH_TO_USER_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax

.endm

#else /* CONFIG_KAISER */

/*
 * CONFIG_KAISER is disabled: every CR3-switch macro expands to nothing, so
 * the entry code can invoke them unconditionally.
 *
 * None of the stubs takes an argument, exactly like the CONFIG_KAISER
 * definitions. A call site that passes an operand therefore fails to
 * assemble in both configurations instead of only when KAISER is enabled.
 */
.macro SWITCH_KERNEL_CR3
.endm
.macro SWITCH_USER_CR3
.endm
.macro SWITCH_USER_CR3_NO_STACK
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm

#endif /* CONFIG_KAISER */
#else /* __ASSEMBLY__ */


#ifdef CONFIG_KAISER
/*
 * Upon a kernel/user mode switch, it may happen that the address space has
 * to be switched before the registers have been stored. Changing the
 * address space needs a scratch register, so one register has to be
 * stored and restored around the switch without using the stack.
 *
 * This per-CPU variable is that storage: the SWITCH_KERNEL_CR3_NO_STACK and
 * SWITCH_USER_CR3_NO_STACK macros save %rax here and reload it afterwards.
 */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

#endif /* CONFIG_KAISER */

/**
 *  kaiser_add_mapping - map a virtual memory part to the shadow mapping
 *  @addr: the start address of the range
 *  @size: the size of the range
 *  @flags: the mapping flags of the pages
 *
 *  The mapping is done on a global scope, so no bigger synchronization has to be done.
 *  The pages have to be manually unmapped again when they are not needed any longer.
 */
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);


/**
 *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
 *  @start: the start address of the range
 *  @size: the size of the range
 */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
 *  kaiser_init - initialize the shadow mapping
 *
 *  Most parts of the shadow mapping can be mapped upon boot time.
 *  Only the thread stacks have to be mapped on runtime.
 *  The mapped regions are not unmapped at all.
 */
extern void kaiser_init(void);

#endif



#endif /* _ASM_X86_KAISER_H */
+4 −0
Original line number Diff line number Diff line
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
/*
 * clone_pgd_range - copy a run of pgd entries from one table to another.
 * @dst:   first destination entry
 * @src:   first source entry
 * @count: number of pgd_t entries to copy
 *
 * With CONFIG_KAISER the same number of entries is additionally copied
 * between the tables returned by native_get_shadow_pgd() for @src and @dst.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	const size_t nbytes = count * sizeof(pgd_t);

	memcpy(dst, src, nbytes);
#ifdef CONFIG_KAISER
	/* Keep the shadow pgd in step with the table it belongs to. */
	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), nbytes);
#endif
}

#define PTE_SHIFT ilog2(PTRS_PER_PTE)
Loading