Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f95d47ca authored by Jeremy Fitzhardinge, committed by Andi Kleen
Browse files

[PATCH] i386: Use %gs as the PDA base-segment in the kernel



This patch is the meat of the PDA change.  This patch makes several related
changes:

1: Most significantly, %gs is now used in the kernel.  This means that on
   entry, the old value of %gs is saved away, and it is reloaded with
   __KERNEL_PDA.

2: entry.S constructs the stack in the shape of struct pt_regs, and this
   is passed around the kernel so that the process's saved register
   state can be accessed.

   Unfortunately struct pt_regs doesn't currently have space for %gs
   (or %fs). This patch extends pt_regs to add space for gs (no space
   is allocated for %fs, since it won't be used, and it would just
   complicate the code in entry.S to work around the space).

3: Because %gs is now saved on the stack like %ds, %es and the integer
   registers, there are a number of places where it no longer needs to
   be handled specially; namely context switch, and saving/restoring the
   register state in a signal context.

4: And since kernel threads run in kernel space and call normal kernel
   code, they need to be created with their %gs == __KERNEL_PDA.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
parent 62111195
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -72,6 +72,7 @@ void foo(void)
	OFFSET(PT_EAX, pt_regs, eax);
	OFFSET(PT_DS,  pt_regs, xds);
	OFFSET(PT_ES,  pt_regs, xes);
	OFFSET(PT_GS,  pt_regs, xgs);
	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
	OFFSET(PT_EIP, pt_regs, eip);
	OFFSET(PT_CS,  pt_regs, xcs);
+19 −2
Original line number Diff line number Diff line
@@ -593,6 +593,14 @@ void __init early_cpu_init(void)
#endif
}

/*
 * Make sure %gs is initialized properly in idle threads.
 *
 * Zeroes the caller-supplied pt_regs frame and sets its saved %gs
 * slot (xgs) to __KERNEL_PDA, so that when an idle thread's register
 * state is restored, %gs points at the per-CPU PDA segment rather
 * than a stale or null selector.  Returns the same frame for the
 * caller's convenience.
 */
struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
{
	memset(regs, 0, sizeof(struct pt_regs));
	/* Saved %gs must select the kernel PDA segment on restore. */
	regs->xgs = __KERNEL_PDA;
	return regs;
}

__cpuinit int alloc_gdt(int cpu)
{
	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
@@ -644,6 +652,14 @@ struct i386_pda boot_pda = {
	._pda = &boot_pda,
};

/*
 * Load %gs with the kernel PDA selector for this CPU.  Called after
 * the CPU's GDT (with its PDA descriptor) has been loaded, so that
 * %gs-relative PDA accesses work from this point on.
 */
static inline void set_kernel_gs(void)
{
	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
	   barrier with respect to any PDA operations, so the compiler
	   doesn't move any before here. */
	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
}

/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
   itself, but secondaries find this done for them. */
__cpuinit int init_gdt(int cpu, struct task_struct *idle)
@@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
	   the boot CPU, this will transition from the boot gdt+pda to
	   the real ones). */
	load_gdt(cpu_gdt_descr);
	set_kernel_gs();

	if (cpu_test_and_set(cpu, cpu_initialized)) {
		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif

	/* Clear %fs and %gs. */
	asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
	/* Clear %fs. */
	asm volatile ("mov %0, %%fs" : : "r" (0));

	/* Clear all 6 debug registers: */
	set_debugreg(0, 0);
+48 −22
Original line number Diff line number Diff line
@@ -30,12 +30,13 @@
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
 *	24(%esp) - orig_eax
 *	28(%esp) - %eip
 *	2C(%esp) - %cs
 *	30(%esp) - %eflags
 *	34(%esp) - %oldesp
 *	38(%esp) - %oldss
 *	24(%esp) - %gs
 *	28(%esp) - orig_eax
 *	2C(%esp) - %eip
 *	30(%esp) - %cs
 *	34(%esp) - %eflags
 *	38(%esp) - %oldesp
 *	3C(%esp) - %oldss
 *
 * "current" is in register %ebx during any slow entries.
 */
@@ -92,6 +93,9 @@ VM_MASK = 0x00020000

#define SAVE_ALL \
	cld; \
	pushl %gs; \
	CFI_ADJUST_CFA_OFFSET 4;\
	/*CFI_REL_OFFSET gs, 0;*/\
	pushl %es; \
	CFI_ADJUST_CFA_OFFSET 4;\
	/*CFI_REL_OFFSET es, 0;*/\
@@ -121,7 +125,9 @@ VM_MASK = 0x00020000
	CFI_REL_OFFSET ebx, 0;\
	movl $(__USER_DS), %edx; \
	movl %edx, %ds; \
	movl %edx, %es;
	movl %edx, %es; \
	movl $(__KERNEL_PDA), %edx; \
	movl %edx, %gs

#define RESTORE_INT_REGS \
	popl %ebx;	\
@@ -154,17 +160,22 @@ VM_MASK = 0x00020000
2:	popl %es;	\
	CFI_ADJUST_CFA_OFFSET -4;\
	/*CFI_RESTORE es;*/\
.section .fixup,"ax";	\
3:	movl $0,(%esp);	\
	jmp 1b;		\
3:	popl %gs;	\
	CFI_ADJUST_CFA_OFFSET -4;\
	/*CFI_RESTORE gs;*/\
.pushsection .fixup,"ax";	\
4:	movl $0,(%esp);	\
	jmp 1b;		\
5:	movl $0,(%esp);	\
	jmp 2b;		\
.previous;		\
6:	movl $0,(%esp);	\
	jmp 3b;		\
.section __ex_table,"a";\
	.align 4;	\
	.long 1b,3b;	\
	.long 2b,4b;	\
.previous
	.long 1b,4b;	\
	.long 2b,5b;	\
	.long 3b,6b;	\
.popsection

#define RING0_INT_FRAME \
	CFI_STARTPROC simple;\
@@ -231,6 +242,7 @@ check_userspace:
	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
	cmpl $USER_RPL, %eax
	jb resume_kernel		# not returning to v8086 or userspace

ENTRY(resume_userspace)
 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
					# setting need_resched or sigpending
@@ -327,9 +339,16 @@ sysenter_past_esp:
	movl PT_OLDESP(%esp), %ecx
	xorl %ebp,%ebp
	TRACE_IRQS_ON
1:	mov  PT_GS(%esp), %gs
	ENABLE_INTERRUPTS_SYSEXIT
	CFI_ENDPROC

.pushsection .fixup,"ax"
2:	movl $0,PT_GS(%esp)
	jmp 1b
.section __ex_table,"a"
	.align 4
	.long 1b,2b
.popsection

	# system call handler stub
ENTRY(system_call)
@@ -375,7 +394,7 @@ restore_nocheck:
	TRACE_IRQS_IRET
restore_nocheck_notrace:
	RESTORE_REGS
	addl $4, %esp
	addl $4, %esp			# skip orig_eax/error_code
	CFI_ADJUST_CFA_OFFSET -4
1:	INTERRUPT_RETURN
.section .fixup,"ax"
@@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault)
	CFI_ADJUST_CFA_OFFSET 4
	ALIGN
error_code:
	/* the function address is in %gs's slot on the stack */
	pushl %es
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET es, 0*/
	pushl %ds
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET ds, 0*/
@@ -613,18 +636,20 @@ error_code:
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ebx, 0
	cld
	pushl %es
	pushl %gs
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET es, 0*/
	/*CFI_REL_OFFSET gs, 0*/
	movl $(__KERNEL_PDA), %ecx
	movl %ecx, %gs
	UNWIND_ESPFIX_STACK
	popl %ecx
	CFI_ADJUST_CFA_OFFSET -4
	/*CFI_REGISTER es, ecx*/
	movl PT_ES(%esp), %edi		# get the function address
	movl PT_GS(%esp), %edi		# get the function address
	movl PT_ORIG_EAX(%esp), %edx	# get the error code
	movl $-1, PT_ORIG_EAX(%esp)
	movl %ecx, PT_ES(%esp)
	/*CFI_REL_OFFSET es, ES*/
	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
	mov  %ecx, PT_GS(%esp)
	/*CFI_REL_OFFSET gs, ES*/
	movl $(__USER_DS), %ecx
	movl %ecx, %ds
	movl %ecx, %es
@@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running)
	movl	%ebx, PT_EAX(%edx)
	movl	$__USER_DS, PT_DS(%edx)
	movl	$__USER_DS, PT_ES(%edx)
	movl	$0, PT_GS(%edx)
	movl	%ebx, PT_ORIG_EAX(%edx)
	movl	%ecx, PT_EIP(%edx)
	movl	12(%esp), %ecx
+27 −4
Original line number Diff line number Diff line
@@ -302,6 +302,7 @@ is386: movl $2,%ecx # set MP
	movl %eax,%cr0

	call check_x87
	call setup_pda
	lgdt cpu_gdt_descr
	lidt idt_descr
	ljmp $(__KERNEL_CS),$1f
@@ -312,10 +313,13 @@ is386: movl $2,%ecx # set MP
	movl %eax,%ds
	movl %eax,%es

	xorl %eax,%eax			# Clear FS/GS and LDT
	xorl %eax,%eax			# Clear FS and LDT
	movl %eax,%fs
	movl %eax,%gs
	lldt %ax

	movl $(__KERNEL_PDA),%eax
	mov  %eax,%gs

	cld			# gcc2 wants the direction flag cleared at all times
	pushl $0		# fake return address for unwinder
#ifdef CONFIG_SMP
@@ -345,6 +349,23 @@ check_x87:
	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */
	ret

/*
 * Point the GDT at this CPU's PDA.  On boot this will be
 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
 * that CPU's GDT and PDA.
 *
 * Patches the 32-bit base address of the __KERNEL_PDA segment
 * descriptor in place.  A GDT descriptor scatters its base across
 * three fields: bits 0-15 at descriptor offset 2, bits 16-23 at
 * offset 4, and bits 24-31 at offset 7 — hence the three partial
 * stores below.  Clobbers %eax and %ecx.
 */
setup_pda:
	/* get the PDA pointer */
	movl start_pda, %eax

	/* slot the PDA address into the GDT */
	mov cpu_gdt_descr+2, %ecx		/* %ecx = linear address of the GDT */
	mov %ax, (__KERNEL_PDA+0+2)(%ecx)		/* base & 0x0000ffff */
	shr $16, %eax
	mov %al, (__KERNEL_PDA+4+0)(%ecx)		/* base & 0x00ff0000 */
	mov %ah, (__KERNEL_PDA+4+3)(%ecx)		/* base & 0xff000000 */
	ret

/*
 *  setup_idt
 *
@@ -484,6 +505,8 @@ ENTRY(empty_zero_page)
 * This starts the data section.
 */
.data
ENTRY(start_pda)
	.long boot_pda

ENTRY(stack_start)
	.long init_thread_union+THREAD_SIZE
@@ -525,7 +548,7 @@ idt_descr:

# boot GDT descriptor (later on used by CPU#0):
	.word 0				# 32 bit align gdt_desc.address
cpu_gdt_descr:
ENTRY(cpu_gdt_descr)
	.word GDT_ENTRIES*8-1
	.long cpu_gdt_table

@@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table)
	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */

	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
	.quad 0x0000000000000000	/* 0xd8 - PDA */
	.quad 0x00cf92000000ffff	/* 0xd8 - PDA */
	.quad 0x0000000000000000	/* 0xe0 - unused */
	.quad 0x0000000000000000	/* 0xe8 - unused */
	.quad 0x0000000000000000	/* 0xf0 - unused */
+12 −14
Original line number Diff line number Diff line
@@ -56,6 +56,7 @@

#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/pda.h>

asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

@@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)

	regs.xds = __USER_DS;
	regs.xes = __USER_DS;
	regs.xgs = __KERNEL_PDA;
	regs.orig_eax = -1;
	regs.eip = (unsigned long) kernel_thread_helper;
	regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	tsk = current;
	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
	load_esp0(tss, next);

	/*
	 * Save away %fs and %gs. No need to save %es and %ds, as
	 * those are always kernel segments while inside the kernel.
	 * Doing this before setting the new TLS descriptors avoids
	 * the situation where we temporarily have non-reloadable
	 * segments in %fs and %gs.  This could be an issue if the
	 * NMI handler ever used %fs or %gs (it does not today), or
	 * if the kernel is running inside of a hypervisor layer.
	 * Save away %fs. No need to save %gs, as it was saved on the
	 * stack on entry.  No need to save %es and %ds, as those are
	 * always kernel segments while inside the kernel.  Doing this
	 * before setting the new TLS descriptors avoids the situation
	 * where we temporarily have non-reloadable segments in %fs
	 * and %gs.  This could be an issue if the NMI handler ever
	 * used %fs or %gs (it does not today), or if the kernel is
	 * running inside of a hypervisor layer.
	 */
	savesegment(fs, prev->fs);
	savesegment(gs, prev->gs);

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
@@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
	load_TLS(next, cpu);

	/*
	 * Restore %fs and %gs if needed.
	 * Restore %fs if needed.
	 *
	 * Glibc normally makes %fs be zero, and %gs is one of
	 * the TLS segments.
	 * Glibc normally makes %fs be zero.
	 */
	if (unlikely(prev->fs | next->fs))
		loadsegment(fs, next->fs);

	if (prev->gs | next->gs)
		loadsegment(gs, next->gs);

	/*
	 * Restore IOPL if needed.
Loading