Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 23e09439 authored by Hugh Dickins, committed by Greg Kroah-Hartman
Browse files

kaiser: add "nokaiser" boot option, using ALTERNATIVE




Added "nokaiser" boot option: an early param like "noinvpcid".
Most places now check int kaiser_enabled (#defined 0 when not
CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
and entry_64_compat.S are using the ALTERNATIVE technique, which
patches in the preferred instructions at runtime.  That technique
is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated.

Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
won't get set in some obscure corner, or something adds PGE into CR4.
By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
all page table setup which uses pte_pfn() masks it out of the ptes.

It's slightly shameful that the same declaration versus definition of
kaiser_enabled appears in not one, not two, but in three header files
(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h).  I felt safer that way,
than with #including any of those in any of the others; and did not
feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
them all, so we shall hear about it if they get out of synch.

Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
from kaiser.c; removed the unused native_get_normal_pgd(); removed
the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
comments.  But more interestingly, set CR4.PSE in secondary_startup_64:
the manual is clear that it does not matter whether it's 0 or 1 when
4-level-pts are enabled, but I was distracted to find cr4 different on
BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask().

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent cb7d8d7e
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

	nojitter	[IA-64] Disables jitter checking for ITC timers.

	nokaiser	[X86-64] Disable KAISER isolation of kernel from user.

	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver

	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
+8 −7
Original line number Diff line number Diff line
@@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry)
	 * unconditionally, but we need to find out whether the reverse
	 * should be done on return (conveyed to paranoid_exit in %ebx).
	 */
	movq	%cr3, %rax
	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
	testl	$KAISER_SHADOW_PGD_OFFSET, %eax
	jz	2f
	orl	$2, %ebx
@@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit)
	TRACE_IRQS_OFF_DEBUG
	TRACE_IRQS_IRETQ_DEBUG
#ifdef CONFIG_KAISER
	/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
	testl	$2, %ebx			/* SWITCH_USER_CR3 needed? */
	jz	paranoid_exit_no_switch
	SWITCH_USER_CR3
@@ -1341,13 +1342,14 @@ ENTRY(nmi)
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
	orq	x86_cr3_pcid_noflush, %rax
	pushq	%rax
	/* mask off "user" bit of pgd address and 12 PCID bits: */
	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
	movq	%rax, %cr3
2:
#endif
	call	do_nmi

@@ -1357,8 +1359,7 @@ ENTRY(nmi)
	 * kernel code that needs user CR3, but do we ever return
	 * to "user mode" where we need the kernel CR3?
	 */
	popq	%rax
	mov	%rax, %cr3
	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
#endif

	/*
@@ -1585,13 +1586,14 @@ end_repeat_nmi:
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
	orq	x86_cr3_pcid_noflush, %rax
	pushq	%rax
	/* mask off "user" bit of pgd address and 12 PCID bits: */
	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
	movq	%rax, %cr3
2:
#endif

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
@@ -1603,8 +1605,7 @@ end_repeat_nmi:
	 * kernel code that needs user CR3, like just before
	 * a sysret.
	 */
	popq	%rax
	mov	%rax, %cr3
	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
#endif

	testl	%ebx, %ebx			/* swapgs needed? */
+3 −0
Original line number Diff line number Diff line
@@ -198,6 +198,9 @@
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */

/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_KAISER w/o nokaiser */

/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
+20 −7
Original line number Diff line number Diff line
@@ -46,28 +46,33 @@ movq \reg, %cr3
.endm

.macro SWITCH_KERNEL_CR3
pushq %rax
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
popq %rax
8:
.endm

.macro SWITCH_USER_CR3
pushq %rax
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_USER_CR3 %rax %al
popq %rax
8:
.endm

.macro SWITCH_KERNEL_CR3_NO_STACK
movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
ALTERNATIVE "jmp 8f", \
	__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
	X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
8:
.endm

#else /* CONFIG_KAISER */

.macro SWITCH_KERNEL_CR3 reg
.macro SWITCH_KERNEL_CR3
.endm
.macro SWITCH_USER_CR3 reg regb
.macro SWITCH_USER_CR3
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm
@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);

extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];

extern int kaiser_enabled;
#else
#define kaiser_enabled	0
#endif /* CONFIG_KAISER */

/*
 * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
 * so as to build with tests on kaiser_enabled instead of #ifdefs.
 */

/**
 *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
 *  @addr: the start address of the range
@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
 */
extern void kaiser_init(void);

#endif /* CONFIG_KAISER */

#endif /* __ASSEMBLY */

#endif /* _ASM_X86_KAISER_H */
+14 −6
Original line number Diff line number Diff line
@@ -18,6 +18,12 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>

#ifdef CONFIG_KAISER
extern int kaiser_enabled;
#else
#define kaiser_enabled 0
#endif

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);

@@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd)
	 * page table by accident; it will fault on the first
	 * instruction it tries to run.  See native_set_pgd().
	 */
	if (IS_ENABLED(CONFIG_KAISER))
	if (kaiser_enabled)
		ignore_flags |= _PAGE_NX;

	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
@@ -915,10 +921,12 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
	if (kaiser_enabled) {
		/* Clone the shadow pgd part as well */
		memcpy(native_get_shadow_pgd(dst),
			native_get_shadow_pgd(src),
			count * sizeof(pgd_t));
	}
#endif
}

Loading