Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3e2b68d7 authored by Andy Lutomirski's avatar Andy Lutomirski Committed by Ingo Molnar
Browse files

x86/asm, sched/x86: Rewrite the FS and GS context switch code



The old code was incomprehensible and was buggy on AMD CPUs.

Signed-off-by: default avatarAndy Lutomirski <luto@kernel.org>
Reviewed-by: default avatarBorislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rudolf Marek <r.marek@assembler.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/5f6bde874c6fe6831c6711b5b1522a238ba035b4.1460075211.git.luto@kernel.org


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 7a5d6704
Loading
Loading
Loading
Loading
+93 −55
Original line number Diff line number Diff line
@@ -282,7 +282,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
	unsigned fsindex, gsindex;
	unsigned prev_fsindex, prev_gsindex;
	fpu_switch_t fpu_switch;

	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
@@ -292,8 +292,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);
	savesegment(fs, prev_fsindex);
	savesegment(gs, prev_gsindex);

	/*
	 * Load TLS before restoring any segments so that segment loads
@@ -336,66 +336,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	 * Switch FS and GS.
	 *
	 * These are even more complicated than DS and ES: they have
	 * 64-bit bases are that controlled by arch_prctl.  Those bases
	 * only differ from the values in the GDT or LDT if the selector
	 * is 0.
	 * 64-bit bases are that controlled by arch_prctl.  The bases
	 * don't necessarily match the selectors, as user code can do
	 * any number of things to cause them to be inconsistent.
	 *
	 * Loading the segment register resets the hidden base part of
	 * the register to 0 or the value from the GDT / LDT.  If the
	 * next base address zero, writing 0 to the segment register is
	 * much faster than using wrmsr to explicitly zero the base.
	 * We don't promise to preserve the bases if the selectors are
	 * nonzero.  We also don't promise to preserve the base if the
	 * selector is zero and the base doesn't match whatever was
	 * most recently passed to ARCH_SET_FS/GS.  (If/when the
	 * FSGSBASE instructions are enabled, we'll need to offer
	 * stronger guarantees.)
	 *
	 * The thread_struct.fs and thread_struct.gs values are 0
	 * if the fs and gs bases respectively are not overridden
	 * from the values implied by fsindex and gsindex.  They
	 * are nonzero, and store the nonzero base addresses, if
	 * the bases are overridden.
	 *
	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
	 * be impossible.
	 *
	 * Therefore we need to reload the segment registers if either
	 * the old or new selector is nonzero, and we need to override
	 * the base address if next thread expects it to be overridden.
	 *
	 * This code is unnecessarily slow in the case where the old and
	 * new indexes are zero and the new base is nonzero -- it will
	 * unnecessarily write 0 to the selector before writing the new
	 * base address.
	 *
	 * Note: This all depends on arch_prctl being the only way that
	 * user code can override the segment base.  Once wrfsbase and
	 * wrgsbase are enabled, most of this code will need to change.
	 * As an invariant,
	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) is
	 * impossible.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
	if (next->fsindex) {
		/* Loading a nonzero value into FS sets the index and base. */
		loadsegment(fs, next->fsindex);

	} else {
		if (next->fs) {
			/* Next index is zero but next base is nonzero. */
			if (prev_fsindex)
				loadsegment(fs, 0);
			wrmsrl(MSR_FS_BASE, next->fs);
		} else {
			/* Next base and index are both zero. */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
		 * If user code wrote a nonzero value to FS, then it also
		 * cleared the overridden base address.
		 *
		 * XXX: if user code wrote 0 to FS and cleared the base
		 * address itself, we won't notice and we'll incorrectly
		 * restore the prior base address next time we reschdule
		 * the process.
				 * We don't know the previous base and can't
				 * find out without RDMSR.  Forcibly clear it.
				 */
		if (fsindex)
			prev->fs = 0;
				loadsegment(fs, __USER_DS);
				loadsegment(fs, 0);
			} else {
				/*
				 * If the previous index is zero and ARCH_SET_FS
				 * didn't change the base, then the base is
				 * also zero and we don't need to do anything.
				 */
				if (prev->fs || prev_fsindex)
					loadsegment(fs, 0);
			}
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;
		}
	}
	/*
	 * Save the old state and preserve the invariant.
	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
	 * without RDMSR because Intel user code can zero it without telling
	 * us and AMD user code can program any 32-bit value without telling
	 * us.
	 */
	if (prev_fsindex)
		prev->fs = 0;
	prev->fsindex = prev_fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
	if (next->gsindex) {
		/* Loading a nonzero value into GS sets the index and base. */
		load_gs_index(next->gsindex);

		/* This works (and fails) the same way as fsindex above. */
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
	} else {
		if (next->gs) {
			/* Next index is zero but next base is nonzero. */
			if (prev_gsindex)
				load_gs_index(0);
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
		} else {
			/* Next base and index are both zero. */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
				 * We don't know the previous base and can't
				 * find out without RDMSR.  Forcibly clear it.
				 *
				 * This contains a pointless SWAPGS pair.
				 * Fixing it would involve an explicit check
				 * for Xen or a new pvop.
				 */
				load_gs_index(__USER_DS);
				load_gs_index(0);
			} else {
				/*
				 * If the previous index is zero and ARCH_SET_GS
				 * didn't change the base, then the base is
				 * also zero and we don't need to do anything.
				 */
				if (prev->gs || prev_gsindex)
					load_gs_index(0);
			}
		}
	}
	/*
	 * Save the old state and preserve the invariant.
	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
	 * without RDMSR because Intel user code can zero it without telling
	 * us and AMD user code can program any 32-bit value without telling
	 * us.
	 */
	if (prev_gsindex)
		prev->gs = 0;
	prev->gsindex = prev_gsindex;

	switch_fpu_finish(next_fpu, fpu_switch);