Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1e423bff authored by Andy Lutomirski's avatar Andy Lutomirski Committed by Ingo Molnar
Browse files

x86/entry/64: Migrate the 64-bit syscall slow path to C



This is more complicated than the 32-bit and compat cases
because it preserves an asm fast path for the case where the
callee-saved regs aren't needed in pt_regs and no entry or exit
work needs to be done.

This appears to slow down fastpath syscalls by no more than one
cycle on my Skylake laptop.

Signed-off-by: default avatarAndy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ce2335a4d42dc164b24132ee5e8c7716061f947b.1454022279.git.luto@kernel.org


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 24d978b7
Loading
Loading
Loading
Loading
+26 −0
Original line number Diff line number Diff line
@@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
	prepare_exit_to_usermode(regs);
}

#ifdef CONFIG_X86_64
__visible void do_syscall_64(struct pt_regs *regs)
{
	struct thread_info *ti = pt_regs_to_thread_info(regs);
	unsigned long nr = regs->orig_ax;

	local_irq_enable();

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	/*
	 * NB: Native and x32 syscalls are dispatched from the same
	 * table.  The only functional difference is the x32 bit in
	 * regs->orig_ax, which changes the behavior of some syscalls.
	 */
	if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
		regs->ax = sys_call_table[nr & __SYSCALL_MASK](
			regs->di, regs->si, regs->dx,
			regs->r10, regs->r8, regs->r9);
	}

	syscall_return_slowpath(regs);
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on and does all entry and
+39 −78
Original line number Diff line number Diff line
@@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	/*
	 * Re-enable interrupts.
	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
	 * must execute atomically in the face of possible interrupt-driven
	 * task preemption. We must enable interrupts only after we're done
	 * with using rsp_scratch:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
@@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
	pushq	%r11				/* pt_regs->r11 */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	tracesys
	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall.  If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
@@ -193,88 +199,43 @@ entry_SYSCALL_64_fastpath:

	movq	%rax, RAX(%rsp)
1:
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has incompletely filled pt_regs.
 */
	LOCKDEP_SYS_EXIT
	/*
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)

	/*
	 * We must check ti flags with interrupts (or at least preemption)
	 * off because we must *never* return to userspace without
	 * processing exit work that is enqueued if we're preempted here.
	 * In particular, returning to userspace with any of the one-shot
	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
	 * very bad.
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
	jnz	1f

	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
	RESTORE_C_REGS
	movq	RSP(%rsp), %rsp
	/*
	 * 64-bit SYSRET restores rip from rcx,
	 * rflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * Restoration of rflags re-enables interrupts.
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized.  This means that we should
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector.  (All
	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
	 * from happening by reloading SS in __switch_to.  (Actually
	 * detecting the failure in 64-bit userspace is tricky but can be
	 * done.)
	 */
	USERGS_SYSRET64

GLOBAL(int_ret_from_sys_call_irqs_off)
1:
	/*
	 * The fast path looked good when we started, but something changed
	 * along the way and we need to switch to the slow path.  Calling
	 * raise(3) will trigger this, for example.  IRQs are off.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	jmp int_ret_from_sys_call

	/* Do syscall entry tracing */
tracesys:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	syscall_trace_enter

	/*
	 * Reload registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	RESTORE_C_REGS_EXCEPT_RAX
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx			/* fixup for C */
	call	*sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
	RESTORE_EXTRA_REGS
1:
	/* Use IRET because user could have changed pt_regs->foo */
	call	syscall_return_slowpath	/* returns with IRQs disabled */
	jmp	return_from_SYSCALL_64

/*
 * Syscall return path ending with IRET.
 * Has correct iret frame.
 */
GLOBAL(int_ret_from_sys_call)
entry_SYSCALL64_slow_path:
	/* IRQs are off. */
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	syscall_return_slowpath	/* returns with IRQs disabled */
	call	do_syscall_64		/* returns with IRQs disabled */

return_from_SYSCALL_64:
	RESTORE_EXTRA_REGS
	TRACE_IRQS_IRETQ		/* we're about to change IF */

@@ -364,7 +325,7 @@ ENTRY(stub_ptregs_64)

	/* Called from fast path -- pop return address and jump to slow path */
	popq	%rax
	jmp	tracesys	/* called from fast path */
	jmp	entry_SYSCALL64_slow_path	/* called from fast path */

1:
	/* Called from C */