Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5dea1c88 authored by Rusty Russell
Browse files

lguest: use a special 1:1 linear pagetable mode until first switch.



The Host used to create some page tables for the Guest to use at the
top of Guest memory; it would then tell the Guest where this was.  In
particular, it created linear mappings for 0 and 0xC0000000 addresses
because lguest used to switch to its real page tables quite late in
boot.

However, since d50d8fe1 Linux initialized boot page tables in
head_32.S even before the "are we lguest?" boot jump.  So, now we can
simplify things: the Host pagetable code assumes 1:1 linear mapping
until it first calls the LHCALL_NEW_PGTABLE hypercall, which we now do
before we reach C code.

This also means that the Host doesn't need to know anything about the
Guest's PAGE_OFFSET.  (Non-Linux guests might not even have such a
thing).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
parent e0377e25
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -63,7 +63,6 @@ void foo(void)
	BLANK();
	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);

	BLANK();
	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+5 −6
Original line number Diff line number Diff line
@@ -520,17 +520,16 @@ static unsigned long lguest_read_cr2(void)

/* See lguest_set_pte() below. */
static bool cr3_changed = false;
static unsigned long current_cr3;

/*
 * cr3 is the current toplevel pagetable page: the principle is the same as
 * cr0.  Keep a local copy, and tell the Host when it changes.  The only
 * difference is that our local copy is in lguest_data because the Host needs
 * to set it upon our initial hypercall.
 * cr0.  Keep a local copy, and tell the Host when it changes.
 */
static void lguest_write_cr3(unsigned long cr3)
{
	lguest_data.pgdir = cr3;
	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
	current_cr3 = cr3;

	/* These two page tables are simple, linear, and used during boot */
	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +538,7 @@ static void lguest_write_cr3(unsigned long cr3)

static unsigned long lguest_read_cr3(void)
{
	return lguest_data.pgdir;
	return current_cr3;
}

/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -758,7 +757,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr)
{
	/* Simply set it to zero: if it was not, it will fault back in. */
	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
}

/*
+7 −2
Original line number Diff line number Diff line
@@ -27,13 +27,18 @@
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
	/*
	 * We make the "initialization" hypercall now to tell the Host about
	 * us, and also find out where it put our page tables.
	 * We make the "initialization" hypercall now to tell the Host where
	 * our lguest_data struct is.
	 */
	movl $LHCALL_LGUEST_INIT, %eax
	movl $lguest_data - __PAGE_OFFSET, %ebx
	int $LGUEST_TRAP_ENTRY

	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
	movl $LHCALL_NEW_PGTABLE, %eax
	movl $(initial_page_table - __PAGE_OFFSET), %ebx
	int $LGUEST_TRAP_ENTRY

	/* Set up the initial stack so we can run C code. */
	movl $(init_thread_union+THREAD_SIZE),%esp

+2 −0
Original line number Diff line number Diff line
@@ -59,6 +59,8 @@ struct lg_cpu {

	struct lguest_pages *last_pages;

	/* Initialization mode: linear map everything. */
	bool linear_pages;
	int cpu_pgd; /* Which pgd this cpu is currently using */

	/* If a hypercall was asked for, this points to the arguments. */
+84 −194
Original line number Diff line number Diff line
@@ -17,7 +17,6 @@
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/bootparam.h>
#include "lg.h"

/*M:008
@@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
#endif

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
	}

#ifdef CONFIG_X86_PAE
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpmd = __pmd(_PAGE_TABLE);
	} else {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);
@@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -454,6 +468,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
@@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif

	/* Still not set up?  Just map 1:1. */
	if (unlikely(cpu->linear_pages))
		return vaddr;

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
@@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
	return next;
}

/*H:430
 * (iv) Switching page tables
 *
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch.
 */
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
	int newpgdir, repin = 0;

	/* Look to see if we have this one already. */
	newpgdir = find_pgdir(cpu->lg, pgtable);
	/*
	 * If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1.
	 */
	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
		newpgdir = new_pgdir(cpu, pgtable, &repin);
	/* Change the current pgd index to the new one. */
	cpu->cpu_pgd = newpgdir;
	/* If it was completely blank, we map in the Guest kernel stack */
	if (repin)
		pin_stack_pages(cpu);
}

/*H:470
 * Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables, including the Guest's kernel mappings.  This is used
@@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
	/* We need the Guest kernel stack mapped again. */
	pin_stack_pages(cpu);
}

/*H:430
 * (iv) Switching page tables
 *
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch.
 */
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
	/* One past the last valid index doubles as the "no such pgdir" marker. */
	const int no_slot = ARRAY_SIZE(cpu->lg->pgdirs);
	int slot = no_slot;
	int fresh = 0;

	if (likely(!cpu->linear_pages)) {
		/* Normal case: we may already be shadowing this toplevel. */
		slot = find_pgdir(cpu->lg, pgtable);
	} else {
		/*
		 * First call ever: until now the Guest had no page tables of
		 * its own and we were faking a linear mapping.  Discard those
		 * made-up shadows and leave slot at "not found" so that a
		 * new pgdir is set up below.
		 */
		release_all_pagetables(cpu->lg);
		cpu->linear_pages = false;
	}

	/*
	 * Not shadowed yet: allocate a slot, or steal an existing one.  If
	 * the pgdir we get back is a blank one, fresh is set to 1.
	 */
	if (slot == no_slot)
		slot = new_pgdir(cpu, pgtable, &fresh);

	/* From here on this CPU runs on the chosen shadow pgdir. */
	cpu->cpu_pgd = slot;

	/* A blank pgdir has nothing mapped: put the Guest kernel stack in. */
	if (fresh)
		pin_stack_pages(cpu);
}
/*:*/

/*M:009
@@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
}
#endif

/*H:505
 * To get through boot, we construct simple identity page mappings (which
 * set virtual == physical) and linear mappings which will get the Guest far
 * enough into the boot to create its own.  The linear mapping means we
 * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
 * as you'll see.
 *
 * We lay them out of the way, just below the initrd (which is why we need to
 * know its size here).
 */
static unsigned long setup_pagetables(struct lguest *lg,
				      unsigned long mem,
				      unsigned long initrd_size)
{
	pgd_t __user *pgdir;
	pte_t __user *linear;
	unsigned long mem_base = (unsigned long)lg->mem_base;
	unsigned int mapped_pages, i, linear_pages;
#ifdef CONFIG_X86_PAE
	pmd_t __user *pmds;
	unsigned int j;
	pgd_t pgd;
	pmd_t pmd;
#else
	unsigned int phys_linear;
#endif

	/*
	 * We have mapped_pages frames to map, so we need linear_pages page
	 * tables to map them.
	 */
	mapped_pages = mem / PAGE_SIZE;
	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;

	/* We put the toplevel page directory page at the top of memory. */
	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);

	/* Now we use the next linear_pages pages as pte pages */
	linear = (void *)pgdir - linear_pages * PAGE_SIZE;

#ifdef CONFIG_X86_PAE
	/*
	 * And the single mid page goes below that.  We only use one, but
	 * that's enough to map 1G, which definitely gets us through boot.
	 */
	pmds = (void *)linear - PAGE_SIZE;
#endif
	/*
	 * Linear mapping is easy: put every page's address into the
	 * mapping in order.
	 */
	for (i = 0; i < mapped_pages; i++) {
		pte_t pte;
		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
			return -EFAULT;
	}

#ifdef CONFIG_X86_PAE
	/*
	 * Make the Guest PMD entries point to the corresponding place in the
	 * linear mapping (up to one page worth of PMD).
	 */
	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
	     i += PTRS_PER_PTE, j++) {
		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
			return -EFAULT;
	}

	/* One PGD entry, pointing to that PMD page. */
	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
		return -EFAULT;
	/*
	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET
	 */
	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
		return -EFAULT;
#else
	/*
	 * The top level points to the linear page table pages above.
	 * We setup the identity and linear mappings here.
	 */
	phys_linear = (unsigned long)linear - mem_base;
	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
		pgd_t pgd;
		/*
		 * Create a PGD entry which points to the right part of the
		 * linear PTE pages.
		 */
		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

		/*
		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
		 */
		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
					   + i / PTRS_PER_PTE],
				    &pgd, sizeof(pgd)))
			return -EFAULT;
	}
#endif

	/*
	 * We return the top level (guest-physical) address: we remember where
	 * this is to write it into lguest_data when the Guest initializes.
	 */
	return (unsigned long)pgdir - mem_base;
}

/*H:500
 * (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here:
 * When a Guest is first created, we initialize a shadow page table which
 * we will populate on future faults.  The Guest doesn't have any actual
 * pagetables yet, so we set linear_pages to tell demand_page() to fake it
 * for the moment.
 */
int init_guest_pagetable(struct lguest *lg)
{
	u64 mem;
	u32 initrd_size;
	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
#ifdef CONFIG_X86_PAE
	pgd_t *pgd;
	pmd_t *pmd_table;
#endif
	/*
	 * Get the Guest memory size and the ramdisk size from the boot header
	 * located at lg->mem_base (Guest address 0).
	 */
	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
		return -EFAULT;
	struct lg_cpu *cpu = &lg->cpus[0];
	int allocated = 0;

	/*
	 * We start on the first shadow page table, and give it a blank PGD
	 * page.
	 */
	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
		return lg->pgdirs[0].gpgdir;
	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[0].pgdir)
	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
	if (!allocated)
		return -ENOMEM;

#ifdef CONFIG_X86_PAE
	/* For PAE, we also create the initial mid-level. */
	pgd = lg->pgdirs[0].pgdir;
	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
	if (!pmd_table)
		return -ENOMEM;

	set_pgd(pgd + SWITCHER_PGD_INDEX,
		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
#endif

	/* This is the current page table. */
	lg->cpus[0].cpu_pgd = 0;
	/* Start with a linear mapping until the first LHCALL_NEW_PGTABLE. */
	cpu->linear_pages = true;
	return 0;
}

@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
		 * of virtual addresses used by the Switcher.
		 */
		|| put_user(RESERVE_MEM * 1024 * 1024,
			&cpu->lg->lguest_data->reserve_mem)
		|| put_user(cpu->lg->pgdirs[0].gpgdir,
			&cpu->lg->lguest_data->pgdir))
			    &cpu->lg->lguest_data->reserve_mem)) {
		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
		return;
	}

	/*
	 * In flush_user_mappings() we loop from 0 to
Loading