Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f87e4cac authored by Jeremy Fitzhardinge, committed by Jeremy Fitzhardinge
Browse files

xen: SMP guest support



This is a fairly straightforward Xen implementation of smp_ops.

Xen has its own IPI mechanisms, and has no dependency on any
APIC-based IPI.  The smp_ops hooks and the flush_tlb_others pv_op
allow a Xen guest to avoid all APIC code in arch/i386 (the only apic
operation is a single apic_read for the apic version number).

One subtle point which needs to be addressed is unpinning pagetables
when another cpu may have a lazy tlb reference to the pagetable. Xen
will not allow an in-use pagetable to be unpinned, so we must find any
other cpus with a reference to the pagetable and get them to shoot
down their references.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <ak@suse.de>
parent ab550288
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@

config XEN
	bool "Enable support for Xen hypervisor"
	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES)
	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES)
	help
	  This is the Linux Xen port.  Enabling this will allow the
	  kernel to boot in a paravirtualized environment under the
+2 −0
Original line number Diff line number Diff line
obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
			events.o time.o

obj-$(CONFIG_SMP)	+= smp.o
+89 −26
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/smp.h>

#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
@@ -40,6 +41,7 @@
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "xen-ops.h"
#include "mmu.h"
@@ -56,7 +58,7 @@ DEFINE_PER_CPU(unsigned long, xen_cr3);
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

static void xen_vcpu_setup(int cpu)
void xen_vcpu_setup(int cpu)
{
	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
@@ -347,23 +349,14 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
	}
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct Xgt_desc_struct *desc)
static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
				  struct trap_info *traps)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	int cpu = smp_processor_id();
	unsigned in, out, count;

	per_cpu(idt_desc, cpu) = *desc;

	count = (desc->size+1) / 8;
	BUG_ON(count > 256);

	spin_lock(&lock);
	for (in = out = 0; in < count; in++) {
		const u32 *entry = (u32 *)(desc->address + in * 8);

@@ -371,6 +364,31 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
			out++;
	}
	traps[out].address = 0;
}

/* Fill in @traps from this CPU's cached IDT descriptor (per-cpu
   idt_desc, recorded by xen_load_idt).  Lets another caller (e.g.
   secondary-CPU bringup) build a Xen trap table from the IDT the
   kernel last loaded here.  get_cpu_var/put_cpu_var disable
   preemption around the access so the per-cpu descriptor cannot
   change underneath us. */
void xen_copy_trap_info(struct trap_info *traps)
{
	const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc);

	xen_convert_trap_info(desc, traps);

	put_cpu_var(idt_desc);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct Xgt_desc_struct *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];
	int cpu = smp_processor_id();

	per_cpu(idt_desc, cpu) = *desc;

	spin_lock(&lock);

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
@@ -428,6 +446,12 @@ static unsigned long xen_apic_read(unsigned long reg)
{
	return 0;
}

static void xen_apic_write(unsigned long reg, unsigned long val)
{
	/* A Xen guest has no usable local APIC, so no APIC write should
	   ever reach this hook; warn loudly to catch stray references. */
	WARN_ON(1);
}
#endif

static void xen_flush_tlb(void)
@@ -449,6 +473,40 @@ static void xen_flush_tlb_single(unsigned long addr)
		BUG();
}

/* Flush the TLB entry for @va (or the whole TLB if va == TLB_FLUSH_ALL)
 * on the remote CPUs in @cpus using a single Xen mmuext hypercall,
 * instead of sending APIC IPIs.  @mm is the address space being
 * flushed.
 *
 * Preconditions (transitional sanity checks, to be removed):
 *  - the calling CPU must not be in @cpus
 *  - @cpus must be non-empty and @mm non-NULL
 */
static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va)
{
	struct mmuext_op op;
	cpumask_t cpumask = *cpus;

	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	BUG_ON(cpus_empty(cpumask));
	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	BUG_ON(!mm);

	/* If a CPU which we ran on has gone down, OK. */
	cpus_and(cpumask, cpumask, cpu_online_map);
	if (cpus_empty(cpumask))
		return;

	/* Point the hypercall at the *filtered* mask (online CPUs only),
	   not the caller's raw mask, so we never ask Xen to flush a vcpu
	   that has gone offline.  The hypercall is synchronous, so a
	   pointer to the on-stack copy is safe. */
	op.arg2.vcpumask = (void *)&cpumask;
	if (va == TLB_FLUSH_ALL) {
		op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		op.cmd = MMUEXT_INVLPG_MULTI;
		op.arg1.linear_addr = va;
	}

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}

static unsigned long xen_read_cr2(void)
{
	return x86_read_percpu(xen_vcpu)->arch.cr2;
@@ -460,18 +518,6 @@ static void xen_write_cr4(unsigned long cr4)
	native_write_cr4(cr4 & ~X86_CR4_TSD);
}

/*
 * Page-directory addresses above 4GB do not fit into architectural %cr3.
 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
 * must use the following accessor macros to pack/unpack valid MFNs.
 *
 * Note that Xen is using the fact that the pagetable base is always
 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
 * of cr3.
 */
#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

static unsigned long xen_read_cr3(void)
{
	return x86_read_percpu(xen_cr3);
@@ -740,8 +786,8 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.io_delay = xen_io_delay,

#ifdef CONFIG_X86_LOCAL_APIC
	.apic_write = paravirt_nop,
	.apic_write_atomic = paravirt_nop,
	.apic_write = xen_apic_write,
	.apic_write_atomic = xen_apic_write,
	.apic_read = xen_apic_read,
	.setup_boot_clock = paravirt_nop,
	.setup_secondary_clock = paravirt_nop,
@@ -751,6 +797,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,
@@ -796,6 +843,19 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.set_lazy_mode = xen_set_lazy_mode,
};

#ifdef CONFIG_SMP
/* Xen replacements for the native smp_ops hooks: CPU bringup and the
   IPI-style operations go through Xen event channels rather than the
   local APIC. */
static const struct smp_ops xen_smp_ops __initdata = {
	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
	.smp_prepare_cpus = xen_smp_prepare_cpus,
	.cpu_up = xen_cpu_up,
	.smp_cpus_done = xen_smp_cpus_done,

	.smp_send_stop = xen_smp_send_stop,
	.smp_send_reschedule = xen_smp_send_reschedule,
	.smp_call_function_mask = xen_smp_call_function_mask,
};
#endif	/* CONFIG_SMP */

/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -808,6 +868,9 @@ asmlinkage void __init xen_start_kernel(void)

	/* Install Xen paravirt ops */
	paravirt_ops = xen_paravirt_ops;
#ifdef CONFIG_SMP
	smp_ops = xen_smp_ops;
#endif

	xen_setup_features();

+79 −1
Original line number Diff line number Diff line
@@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};

/* IRQ <-> IPI mapping */
static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};

/* Packed IRQ information: binding type, sub-type index, and event channel. */
struct packed_irq
{
@@ -58,7 +61,13 @@ struct packed_irq
static struct packed_irq irq_info[NR_IRQS];

/* Binding types. */
enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
enum {
	IRQT_UNBOUND,
	IRQT_PIRQ,
	IRQT_VIRQ,
	IRQT_IPI,
	IRQT_EVTCHN
};

/* Convenient shorthand for packed representation of an unbound IRQ. */
#define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
@@ -261,6 +270,45 @@ static int bind_evtchn_to_irq(unsigned int evtchn)
	return irq;
}

/* Bind IPI vector @ipi for @cpu to an irq, allocating an irq and a
   Xen event channel on first use.  The result is cached in the
   per-cpu ipi_to_irq table, so later binds of the same (ipi, cpu)
   pair just take an extra reference via irq_bindcount.  Returns the
   irq, or a negative value if no unbound irq is available.
   Serialized by irq_mapping_update_lock. */
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
	struct evtchn_bind_ipi bind_ipi;
	int evtchn, irq;

	spin_lock(&irq_mapping_update_lock);

	irq = per_cpu(ipi_to_irq, cpu)[ipi];
	if (irq == -1) {
		irq = find_unbound_irq();
		if (irq < 0)
			goto out;

		dynamic_irq_init(irq);
		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
					      handle_level_irq, "ipi");

		/* Ask Xen for an event channel that delivers this IPI
		   to @cpu's vcpu. */
		bind_ipi.vcpu = cpu;
		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
						&bind_ipi) != 0)
			BUG();
		evtchn = bind_ipi.port;

		/* Record the evtchn <-> irq mapping and the packed
		   (type, index, evtchn) info for this irq. */
		evtchn_to_irq[evtchn] = irq;
		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);

		per_cpu(ipi_to_irq, cpu)[ipi] = irq;

		bind_evtchn_to_cpu(evtchn, cpu);
	}

	irq_bindcount[irq]++;

 out:
	spin_unlock(&irq_mapping_update_lock);
	return irq;
}


static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
	struct evtchn_bind_virq bind_virq;
@@ -369,6 +417,28 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
}
EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);

/* Bind IPI vector @ipi on @cpu to an irq and install @handler on it.
   Returns the irq number on success or a negative errno; if
   request_irq() fails, the irq binding is torn down again. */
int bind_ipi_to_irqhandler(enum ipi_vector ipi,
			   unsigned int cpu,
			   irq_handler_t handler,
			   unsigned long irqflags,
			   const char *devname,
			   void *dev_id)
{
	int rc;
	int irq = bind_ipi_to_irq(ipi, cpu);

	if (irq < 0)
		return irq;

	rc = request_irq(irq, handler, irqflags, devname, dev_id);
	if (rc == 0)
		return irq;

	unbind_from_irq(irq);
	return rc;
}

void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
	free_irq(irq, dev_id);
@@ -376,6 +446,14 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL_GPL(unbind_from_irqhandler);

/* Deliver IPI @vector to @cpu by notifying the event channel that
   bind_ipi_to_irq() bound for it.  BUGs if no binding exists. */
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
{
	int irq = per_cpu(ipi_to_irq, cpu)[vector];
	BUG_ON(irq < 0);
	notify_remote_via_irq(irq);
}


/*
 * Search the CPUs pending events bitmasks.  For each one found, map
 * the event number to an irq, and feed it into do_IRQ() for
+52 −17
Original line number Diff line number Diff line
@@ -391,8 +391,12 @@ void xen_pgd_pin(pgd_t *pgd)

	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE))
	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
@@ -474,27 +478,58 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
	spin_unlock(&mm->page_table_lock);
}

void xen_exit_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk = current;

	task_lock(tsk);
#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;

	/*
	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
	 */
	if (tsk->active_mm == mm) {
		tsk->active_mm = &init_mm;
		atomic_inc(&init_mm.mm_count);
	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
		leave_mm(smp_processor_id());
}

		switch_mm(mm, &init_mm, tsk);
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

		atomic_dec(&mm->mm_count);
		BUG_ON(atomic_read(&mm->mm_count) == 0);
	if (!cpus_empty(mm->cpu_vm_mask))
		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
					   mm, 1);
}
#else
/* UP variant: only the current CPU can hold a (lazy) reference to
   @mm, so if it does, just repoint %cr3 at the kernel pagetable so
   @mm's pagetable can be unpinned. */
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

	task_unlock(tsk);
/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	/* Get every CPU (including this one) to drop any lazy %cr3
	   reference to mm's pagetable before unpinning it. */
	drop_mm_ref(mm);
	put_cpu();

	/* No CPU references the pagetable any more, so Xen will allow
	   the unpin (making the pagetable plain read-write memory). */
	xen_pgd_unpin(mm->pgd);
}
Loading