Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f4f97b3e authored by Jeremy Fitzhardinge's avatar Jeremy Fitzhardinge Committed by Jeremy Fitzhardinge
Browse files

xen: Complete pagetable pinning



Xen requires all active pagetables to be marked read-only.  When the
base of the pagetable is loaded into %cr3, the hypervisor validates
the entire pagetable and only allows the load to proceed if it all
checks out.

This is pretty slow, so to mitigate this cost Xen has a notion of
pinned pagetables.  Pinned pagetables are pagetables which are
considered to be active even if no processor's cr3 is pointing to them.
This means that they must remain read-only and all updates are validated
by the hypervisor.  This makes context switches much cheaper, because
the hypervisor doesn't need to revalidate the pagetable each time.

This also adds a new paravirt hook which is called during setup once
the zones and memory allocator have been initialized.  When the
init_mm pagetable is first built, the struct page array does not yet
exist, and so there's nowhere to put the init_mm pagetable's PG_pinned
flags.  Once the zones are initialized and the struct page array
exists, we can set the PG_pinned flags for those pages.

This patch also adds the Xen support for pte pages allocated out of
highmem (highpte) by implementing xen_kmap_atomic_pte.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Zach Amsden <zach@vmware.com>
parent c85b04c3
Loading
Loading
Loading
Loading
+69 −18
Original line number Diff line number Diff line
@@ -21,6 +21,9 @@
#include <linux/sched.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>

#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
@@ -500,32 +503,59 @@ static void xen_write_cr3(unsigned long cr3)
	}
}

static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
/*
 * Boot-time alloc_pt hook, used while the initial pagetable is being
 * built: every new pte page is treated as pinned, so its lowmem
 * mapping is made read-only unconditionally.  mm is not consulted.
 */
static void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
{
	void *pt_vaddr;

	/* Only legal before mem_map has been set up (early boot). */
	BUG_ON(mem_map);

	/* XXX pfn isn't necessarily a lowmem page */
	pt_vaddr = __va(PFN_PHYS(pfn));
	make_lowmem_page_readonly(pt_vaddr);
}

static void xen_alloc_pd(u32 pfn)
/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(virt_to_page(mm->pgd))) {
		SetPagePinned(page);

		if (!PageHighMem(page))
			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
		else
			/* make sure there are no stray mappings of
			   this page */
			kmap_flush_unused();
	}

/* Make the lowmem mapping of the page-directory page at pfn writable
   again. */
static void xen_release_pd(u32 pfn)
{
	void *pd_vaddr = __va(PFN_PHYS(pfn));

	make_lowmem_page_readwrite(pd_vaddr);
}

/*
 * Release hook for a pte page.  Only a pinned, non-highmem page is
 * touched: its lowmem mapping is switched back to read-write.
 * This should never happen until we're OK to use struct page.
 */
static void xen_release_pt(u32 pfn)
{
	struct page *pt_page = pfn_to_page(pfn);

	/* Nothing to undo for unpinned pages or for highmem pages. */
	if (!PagePinned(pt_page) || PageHighMem(pt_page))
		return;

	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn,
					u32 start, u32 count)
#ifdef CONFIG_HIGHPTE
/*
 * kmap_atomic_pte hook (only built with CONFIG_HIGHPTE): map a pte
 * page through kmap_atomic_prot().  A pinned page is mapped with
 * PAGE_KERNEL_RO, any other page with PAGE_KERNEL.
 */
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	/* NOTE(review): the next line looks like residue of the removed
	   xen_alloc_pd_clone() body in this diff view -- confirm. */
	xen_alloc_pd(pfn);
	pgprot_t prot = PAGE_KERNEL;

	if (PagePinned(page))
		prot = PAGE_KERNEL_RO;

	/* Debug trace, disabled by the constant 0 in the condition. */
	if (0 && PageHighMem(page))
		printk("mapping highpte %lx type %d prot %s\n",
		       page_to_pfn(page), type,
		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");

	return kmap_atomic_prot(page, type, prot);
}
#endif

static __init void xen_pagetable_setup_start(pgd_t *base)
{
@@ -553,7 +583,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
				       PAGE_SIZE);

				xen_alloc_pd(PFN_DOWN(__pa(pmd)));
				make_lowmem_page_readonly(pmd);

				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
			} else
@@ -574,6 +604,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base)

static __init void xen_pagetable_setup_done(pgd_t *base)
{
	/* This will work as long as patching hasn't happened yet
	   (which it hasn't) */
	paravirt_ops.alloc_pt = xen_alloc_pt;

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		/*
		 * Create a mapping for the shared info page.
@@ -591,7 +625,19 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

	xen_pgd_pin(base);
	/* Actually pin the pagetable down, but we can't set PG_pinned
	   yet because the page structures don't exist yet.  The pin
	   command must match the type of the top-level table: L3 when
	   built with PAE, L2 otherwise (the same selection that
	   xen_pgd_pin() makes). */
	{
		struct mmuext_op op;
#ifdef CONFIG_X86_PAE
		op.cmd = MMUEXT_PIN_L3_TABLE;
#else
		op.cmd = MMUEXT_PIN_L2_TABLE;
#endif
		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
		/* Any non-zero result from the hypercall is fatal. */
		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
			BUG();
	}

	xen_vcpu_setup(smp_processor_id());
}
@@ -608,6 +654,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.memory_setup = xen_memory_setup,
	.arch_setup = xen_arch_setup,
	.init_IRQ = xen_init_IRQ,
	.post_allocator_init = xen_mark_init_mm_pinned,

	.time_init = xen_time_init,
	.set_wallclock = xen_set_wallclock,
@@ -688,11 +735,15 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.pagetable_setup_start = xen_pagetable_setup_start,
	.pagetable_setup_done = xen_pagetable_setup_done,

	.alloc_pt = xen_alloc_pt,
	.alloc_pd = xen_alloc_pd,
	.alloc_pd_clone = xen_alloc_pd_clone,
	.release_pd = xen_release_pd,
	.alloc_pt = xen_alloc_pt_init,
	.release_pt = xen_release_pt,
	.alloc_pd = paravirt_nop,
	.alloc_pd_clone = paravirt_nop,
	.release_pd = paravirt_nop,

#ifdef CONFIG_HIGHPTE
	.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif

	.set_pte = xen_set_pte,
	.set_pte_at = xen_set_pte_at,
+170 −90
Original line number Diff line number Diff line
@@ -38,19 +38,22 @@
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/highmem.h>
#include <linux/bug.h>
#include <linux/sched.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/paravirt.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

xmaddr_t arbitrary_virt_to_machine(unsigned long address)
@@ -92,16 +95,6 @@ void make_lowmem_page_readwrite(void *vaddr)
}


/*
 * Write a pte through the hypervisor: issue a single mmu_update
 * request targeting the machine address of ptep.  A negative result
 * from the hypercall is fatal.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	struct mmu_update req;
	int rc;

	req.ptr = virt_to_machine(ptep).maddr;
	req.val = pte_val_ma(pte);

	rc = HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
	if (rc < 0)
		BUG();
}

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;
@@ -112,18 +105,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
		BUG();
}

#ifdef CONFIG_X86_PAE
/*
 * Write a pud entry through the hypervisor: issue a single mmu_update
 * request targeting the machine address of ptr.  A negative result
 * from the hypercall is fatal.
 */
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct mmu_update req;
	int rc;

	req.ptr = virt_to_machine(ptr).maddr;
	req.val = pud_val_ma(val);

	rc = HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
	if (rc < 0)
		BUG();
}
#endif

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
@@ -170,6 +151,23 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
}

#ifdef CONFIG_X86_PAE
/*
 * Write a pud entry through the hypervisor: issue a single mmu_update
 * request targeting the machine address of ptr.  A negative result
 * from the hypercall is fatal.
 */
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct mmu_update req;
	int rc;

	req.ptr = virt_to_machine(ptr).maddr;
	req.val = pud_val_ma(val);

	rc = HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
	if (rc < 0)
		BUG();
}

/*
 * PAE variant: install a pte by storing its two halves directly,
 * without a hypercall.  pte_high is stored first and pte_low last,
 * with smp_wmb() between the two stores.
 * NOTE(review): presumably the ordering exists because the low half
 * is the one that makes the entry live -- confirm against the PAE pte
 * layout.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();	/* keep the high-half store ahead of the low-half store */
	ptep->pte_low = pte.pte_low;
}

void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, pte_val_ma(pte));
@@ -239,6 +237,11 @@ pgd_t xen_make_pgd(unsigned long long pgd)
	return (pgd_t){ pgd };
}
#else  /* !PAE */
/* Non-PAE variant: install a pte with a single direct store to *ptep,
   without a hypercall. */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	*ptep = pte;
}

unsigned long xen_pte_val(pte_t pte)
{
	unsigned long ret = pte.pte_low;
@@ -249,13 +252,6 @@ unsigned long xen_pte_val(pte_t pte)
	return ret;
}

/* Non-PAE stub: calls BUG() unconditionally; the argument is never
   examined. */
unsigned long xen_pmd_val(pmd_t pmd)
{
	/* a BUG here is a lot easier to track down than a NULL eip */
	BUG();
	return 0;
}

unsigned long xen_pgd_val(pgd_t pgd)
{
	unsigned long ret = pgd.pgd;
@@ -272,13 +268,6 @@ pte_t xen_make_pte(unsigned long pte)
	return (pte_t){ pte };
}

/* Non-PAE stub: calls BUG() unconditionally; the argument is never
   examined. */
pmd_t xen_make_pmd(unsigned long pmd)
{
	/* a BUG here is a lot easier to track down than a NULL eip */
	BUG();
	return __pmd(0);
}

pgd_t xen_make_pgd(unsigned long pgd)
{
	if (pgd & _PAGE_PRESENT)
@@ -290,108 +279,199 @@ pgd_t xen_make_pgd(unsigned long pgd)



static void pgd_walk_set_prot(void *pt, pgprot_t flags)
/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable, but it only bothers
  pinning pte pages which are below pte_limit.  In the normal case
  this will be TASK_SIZE, but at boot we need to pin up to
  FIXADDR_TOP.  But the important bit is that we don't pin beyond
  there, because then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
		    unsigned long limit)
{
	unsigned long pfn = PFN_DOWN(__pa(pt));
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	if (HYPERVISOR_update_va_mapping((unsigned long)pt,
					 pfn_pte(pfn, flags), 0) < 0)
		BUG();
}
	BUG_ON(limit > FIXADDR_TOP);

static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
{
	pgd_t *pgd = pgd_base;
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int    g, u, m;
		unsigned long pud_limit, pud_next;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;
		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
		if (pgd_none(*pgd))
		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			pgd_walk_set_prot(pud, flags);
			flush |= (*func)(virt_to_page(pud), 0);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				pgd_walk_set_prot(pmd, flags);
				flush |= (*func)(virt_to_page(pmd), 0);

			for (; addr != pmd_limit; pmd++) {
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
				if (pmd_none(*pmd))
					continue;

				/* This can get called before mem_map
				   is set up, so we assume nothing is
				   highmem at that point. */
				if (mem_map == NULL ||
				    !PageHighMem(pmd_page(*pmd))) {
					pte = pte_offset_kernel(pmd, 0);
					pgd_walk_set_prot(pte, flags);
				flush |= (*func)(pmd_page(*pmd), 0);
			}
		}
	}

	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);

	return flush;
}

	if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
					 pfn_pte(PFN_DOWN(__pa(pgd_base)),
						 flags),
					 UVMF_TLB_FLUSH) < 0)
		BUG();
/*
 * pgd_walk() callback used when pinning a pagetable.  Sets PG_pinned
 * on the page and, for a lowmem page that was not already pinned,
 * queues a multicall entry remapping it with PAGE_KERNEL_RO.  flags
 * is passed through to that update_va_mapping entry.
 *
 * Returns 1 when a previously unpinned highmem page was found (the
 * caller then flushes kmaps), 0 in every other case.
 */
static int pin_page(struct page *page, unsigned flags)
{
	struct multicall_space mcs;
	unsigned long vaddr;

	/* Bit was already set: already pinned, nothing to do. */
	if (test_and_set_bit(PG_pinned, &page->flags))
		return 0;

	/* kmaps need flushing if we found an unpinned highpage */
	if (PageHighMem(page))
		return 1;

	vaddr = (unsigned long)lowmem_page_address(page);
	mcs = __xen_mc_entry(0);

	MULTI_update_va_mapping(mcs.mc, vaddr,
				pfn_pte(page_to_pfn(page), PAGE_KERNEL_RO),
				flags);

	return 0;
}

/* This is called just after a mm has been duplicated from its parent,
   but it has not been used yet.  We need to make sure that its
   pagetable is all read-only, and can be pinned. */
/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
	struct mmuext_op op;
	struct multicall_space mcs;
	struct mmuext_op *op;

	xen_mc_batch();

	pgd_walk(pgd, PAGE_KERNEL_RO);
	if (pgd_walk(pgd, pin_page, TASK_SIZE))
		kmap_flush_unused();

#if defined(CONFIG_X86_PAE)
	op.cmd = MMUEXT_PIN_L3_TABLE;
	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;

#ifdef CONFIG_X86_PAE
	op->cmd = MMUEXT_PIN_L3_TABLE;
#else
	op.cmd = MMUEXT_PIN_L2_TABLE;
	op->cmd = MMUEXT_PIN_L2_TABLE;
#endif
	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		BUG();
	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(0);
}

/* Release a pagetables pages back as normal RW */
void xen_pgd_unpin(pgd_t *pgd)
/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now.

   mark_pinned() is the pgd_walk() callback for that: it only calls
   SetPagePinned() on the page and always returns 0.  flags is not
   used. */
static __init int mark_pinned(struct page *page, unsigned flags)
{
	/* NOTE(review): the next line looks like residue of the removed
	   xen_pgd_unpin() body in this diff view -- confirm. */
	struct mmuext_op op;
	SetPagePinned(page);
	return 0;
}

	op.cmd = MMUEXT_UNPIN_TABLE;
	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
/* Walk init_mm's pagetable with a limit of FIXADDR_TOP, applying
   mark_pinned() to the pagetable pages the walker visits.  Installed
   as the post_allocator_init paravirt hook. */
void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		BUG();
/*
 * pgd_walk() callback used when unpinning a pagetable.  Clears
 * PG_pinned on the page; if the bit had been set and the page is not
 * highmem, queues a multicall entry remapping the page's lowmem
 * address with PAGE_KERNEL.  flags is passed through to that entry.
 * Always returns 0.
 */
static int unpin_page(struct page *page, unsigned flags)
{
	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

	/* NOTE(review): the next line looks like residue of the removed
	   xen_pgd_unpin() body in this diff view -- confirm. */
	pgd_walk(pgd, PAGE_KERNEL);
	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					flags);
	}

	return 0;		/* never need to flush on unpin */
}

/*
 * Release a pagetable's pages back as normal RW.
 *
 * The work is queued between xen_mc_batch() and xen_mc_issue(0):
 * first the MMUEXT_UNPIN_TABLE op for the pgd itself, then the
 * per-page updates produced by walking the pagetable with
 * unpin_page() up to TASK_SIZE.
 */
static void xen_pgd_unpin(pgd_t *pgd)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	xen_mc_batch();

	/* Reserve a multicall slot with room for one mmuext_op argument. */
	mcs = __xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_UNPIN_TABLE;
	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* Queued after the unpin op: return value (flush hint) is ignored. */
	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

/* Pin next's pagetable via xen_pgd_pin() while holding
   next->page_table_lock.  prev is not used here. */
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

/* Pin the new mm's pagetable via xen_pgd_pin() while holding
   mm->page_table_lock.  oldmm is not used here. */
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

void xen_exit_mmap(struct mm_struct *mm)
+1 −1
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
void xen_exit_mmap(struct mm_struct *mm);

void xen_pgd_pin(pgd_t *pgd);
void xen_pgd_unpin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);

#ifdef CONFIG_X86_PAE
unsigned long long xen_pte_val(pte_t);
+2 −0
Original line number Diff line number Diff line
@@ -20,6 +20,8 @@ unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
cycle_t xen_clocksource_read(void);

void xen_mark_init_mm_pinned(void);

DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);

static inline unsigned xen_get_lazy_mode(void)