Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f87e4cac authored by Jeremy Fitzhardinge, committed by Jeremy Fitzhardinge
Browse files

xen: SMP guest support



This is a fairly straightforward Xen implementation of smp_ops.

Xen has its own IPI mechanisms, and has no dependency on any
APIC-based IPI.  The smp_ops hooks and the flush_tlb_others pv_op
allow a Xen guest to avoid all APIC code in arch/i386 (the only apic
operation is a single apic_read for the apic version number).

One subtle point which needs to be addressed is unpinning pagetables
when another cpu may have a lazy tlb reference to the pagetable. Xen
will not allow an in-use pagetable to be unpinned, so we must find any
other cpus with a reference to the pagetable and get them to shoot
down their references.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <ak@suse.de>
parent ab550288
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@

config XEN
	bool "Enable support for Xen hypervisor"
	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES)
	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES)
	help
	  This is the Linux Xen port.  Enabling this will allow the
	  kernel to boot in a paravirtualized environment under the
+2 −0
Original line number Diff line number Diff line
obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
			events.o time.o

obj-$(CONFIG_SMP)	+= smp.o
+89 −26
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/smp.h>

#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
@@ -40,6 +41,7 @@
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "xen-ops.h"
#include "mmu.h"
@@ -56,7 +58,7 @@ DEFINE_PER_CPU(unsigned long, xen_cr3);
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

static void xen_vcpu_setup(int cpu)
void xen_vcpu_setup(int cpu)
{
	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
@@ -347,23 +349,14 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
	}
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct Xgt_desc_struct *desc)
static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
				  struct trap_info *traps)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	int cpu = smp_processor_id();
	unsigned in, out, count;

	per_cpu(idt_desc, cpu) = *desc;

	count = (desc->size+1) / 8;
	BUG_ON(count > 256);

	spin_lock(&lock);
	for (in = out = 0; in < count; in++) {
		const u32 *entry = (u32 *)(desc->address + in * 8);

@@ -371,6 +364,31 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
			out++;
	}
	traps[out].address = 0;
}

/* Fill in @traps from this CPU's cached IDT descriptor (per-cpu
   idt_desc, recorded by xen_load_idt).  Lets another caller (e.g.
   secondary-CPU bringup) build a Xen trap table from the IDT the
   kernel last loaded here.  get_cpu_var/put_cpu_var disable
   preemption around the access so the per-cpu descriptor cannot
   change underneath us. */
void xen_copy_trap_info(struct trap_info *traps)
{
	const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc);

	xen_convert_trap_info(desc, traps);

	put_cpu_var(idt_desc);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct Xgt_desc_struct *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];
	int cpu = smp_processor_id();

	per_cpu(idt_desc, cpu) = *desc;

	spin_lock(&lock);

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
@@ -428,6 +446,12 @@ static unsigned long xen_apic_read(unsigned long reg)
{
	return 0;
}

static void xen_apic_write(unsigned long reg, unsigned long val)
{
	/* A Xen guest has no usable local APIC, so no APIC write should
	   ever reach this hook; warn loudly to catch stray references. */
	WARN_ON(1);
}
#endif

static void xen_flush_tlb(void)
@@ -449,6 +473,40 @@ static void xen_flush_tlb_single(unsigned long addr)
		BUG();
}

/* Flush the TLB entry for @va (or the whole TLB if va == TLB_FLUSH_ALL)
 * on the remote CPUs in @cpus using a single Xen mmuext hypercall,
 * instead of sending APIC IPIs.  @mm is the address space being
 * flushed.
 *
 * Preconditions (transitional sanity checks, to be removed):
 *  - the calling CPU must not be in @cpus
 *  - @cpus must be non-empty and @mm non-NULL
 */
static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va)
{
	struct mmuext_op op;
	cpumask_t cpumask = *cpus;

	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	BUG_ON(cpus_empty(cpumask));
	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	BUG_ON(!mm);

	/* If a CPU which we ran on has gone down, OK. */
	cpus_and(cpumask, cpumask, cpu_online_map);
	if (cpus_empty(cpumask))
		return;

	/* Point the hypercall at the *filtered* mask (online CPUs only),
	   not the caller's raw mask, so we never ask Xen to flush a vcpu
	   that has gone offline.  The hypercall is synchronous, so a
	   pointer to the on-stack copy is safe. */
	op.arg2.vcpumask = (void *)&cpumask;
	if (va == TLB_FLUSH_ALL) {
		op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		op.cmd = MMUEXT_INVLPG_MULTI;
		op.arg1.linear_addr = va;
	}

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}

static unsigned long xen_read_cr2(void)
{
	return x86_read_percpu(xen_vcpu)->arch.cr2;
@@ -460,18 +518,6 @@ static void xen_write_cr4(unsigned long cr4)
	native_write_cr4(cr4 & ~X86_CR4_TSD);
}

/*
 * Page-directory addresses above 4GB do not fit into architectural %cr3.
 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
 * must use the following accessor macros to pack/unpack valid MFNs.
 *
 * Note that Xen is using the fact that the pagetable base is always
 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
 * of cr3.
 */
#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

static unsigned long xen_read_cr3(void)
{
	return x86_read_percpu(xen_cr3);
@@ -740,8 +786,8 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.io_delay = xen_io_delay,

#ifdef CONFIG_X86_LOCAL_APIC
	.apic_write = paravirt_nop,
	.apic_write_atomic = paravirt_nop,
	.apic_write = xen_apic_write,
	.apic_write_atomic = xen_apic_write,
	.apic_read = xen_apic_read,
	.setup_boot_clock = paravirt_nop,
	.setup_secondary_clock = paravirt_nop,
@@ -751,6 +797,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,
@@ -796,6 +843,19 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.set_lazy_mode = xen_set_lazy_mode,
};

#ifdef CONFIG_SMP
/* Xen replacements for the native smp_ops hooks: CPU bringup and the
   IPI-style operations go through Xen event channels rather than the
   local APIC. */
static const struct smp_ops xen_smp_ops __initdata = {
	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
	.smp_prepare_cpus = xen_smp_prepare_cpus,
	.cpu_up = xen_cpu_up,
	.smp_cpus_done = xen_smp_cpus_done,

	.smp_send_stop = xen_smp_send_stop,
	.smp_send_reschedule = xen_smp_send_reschedule,
	.smp_call_function_mask = xen_smp_call_function_mask,
};
#endif	/* CONFIG_SMP */

/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -808,6 +868,9 @@ asmlinkage void __init xen_start_kernel(void)

	/* Install Xen paravirt ops */
	paravirt_ops = xen_paravirt_ops;
#ifdef CONFIG_SMP
	smp_ops = xen_smp_ops;
#endif

	xen_setup_features();

+79 −1
Original line number Diff line number Diff line
@@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};

/* IRQ <-> IPI mapping */
static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};

/* Packed IRQ information: binding type, sub-type index, and event channel. */
struct packed_irq
{
@@ -58,7 +61,13 @@ struct packed_irq
static struct packed_irq irq_info[NR_IRQS];

/* Binding types. */
enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
enum {
	IRQT_UNBOUND,
	IRQT_PIRQ,
	IRQT_VIRQ,
	IRQT_IPI,
	IRQT_EVTCHN
};

/* Convenient shorthand for packed representation of an unbound IRQ. */
#define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
@@ -261,6 +270,45 @@ static int bind_evtchn_to_irq(unsigned int evtchn)
	return irq;
}

/* Bind IPI vector @ipi for @cpu to an irq, allocating an irq and a
   Xen event channel on first use.  The result is cached in the
   per-cpu ipi_to_irq table, so later binds of the same (ipi, cpu)
   pair just take an extra reference via irq_bindcount.  Returns the
   irq, or a negative value if no unbound irq is available.
   Serialized by irq_mapping_update_lock. */
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
	struct evtchn_bind_ipi bind_ipi;
	int evtchn, irq;

	spin_lock(&irq_mapping_update_lock);

	irq = per_cpu(ipi_to_irq, cpu)[ipi];
	if (irq == -1) {
		irq = find_unbound_irq();
		if (irq < 0)
			goto out;

		dynamic_irq_init(irq);
		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
					      handle_level_irq, "ipi");

		/* Ask Xen for an event channel that delivers this IPI
		   to @cpu's vcpu. */
		bind_ipi.vcpu = cpu;
		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
						&bind_ipi) != 0)
			BUG();
		evtchn = bind_ipi.port;

		/* Record the evtchn <-> irq mapping and the packed
		   (type, index, evtchn) info for this irq. */
		evtchn_to_irq[evtchn] = irq;
		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);

		per_cpu(ipi_to_irq, cpu)[ipi] = irq;

		bind_evtchn_to_cpu(evtchn, cpu);
	}

	irq_bindcount[irq]++;

 out:
	spin_unlock(&irq_mapping_update_lock);
	return irq;
}


static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
	struct evtchn_bind_virq bind_virq;
@@ -369,6 +417,28 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
}
EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);

/* Bind IPI vector @ipi on @cpu to an irq and install @handler on it.
   Returns the irq number on success or a negative errno; if
   request_irq() fails, the irq binding is torn down again. */
int bind_ipi_to_irqhandler(enum ipi_vector ipi,
			   unsigned int cpu,
			   irq_handler_t handler,
			   unsigned long irqflags,
			   const char *devname,
			   void *dev_id)
{
	int rc;
	int irq = bind_ipi_to_irq(ipi, cpu);

	if (irq < 0)
		return irq;

	rc = request_irq(irq, handler, irqflags, devname, dev_id);
	if (rc == 0)
		return irq;

	unbind_from_irq(irq);
	return rc;
}

void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
	free_irq(irq, dev_id);
@@ -376,6 +446,14 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL_GPL(unbind_from_irqhandler);

/* Deliver IPI @vector to @cpu by notifying the event channel that
   bind_ipi_to_irq() bound for it.  BUGs if no binding exists. */
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
{
	int irq = per_cpu(ipi_to_irq, cpu)[vector];
	BUG_ON(irq < 0);
	notify_remote_via_irq(irq);
}


/*
 * Search the CPUs pending events bitmasks.  For each one found, map
 * the event number to an irq, and feed it into do_IRQ() for
+52 −17
Original line number Diff line number Diff line
@@ -391,8 +391,12 @@ void xen_pgd_pin(pgd_t *pgd)

	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE))
	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
@@ -474,27 +478,58 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
	spin_unlock(&mm->page_table_lock);
}

void xen_exit_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk = current;

	task_lock(tsk);
#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;

	/*
	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
	 */
	if (tsk->active_mm == mm) {
		tsk->active_mm = &init_mm;
		atomic_inc(&init_mm.mm_count);
	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
		leave_mm(smp_processor_id());
}

		switch_mm(mm, &init_mm, tsk);
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

		atomic_dec(&mm->mm_count);
		BUG_ON(atomic_read(&mm->mm_count) == 0);
	if (!cpus_empty(mm->cpu_vm_mask))
		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
					   mm, 1);
}
#else
/* UP variant: only the current CPU can hold a (lazy) reference to
   @mm, so if it does, just repoint %cr3 at the kernel pagetable so
   @mm's pagetable can be unpinned. */
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

	task_unlock(tsk);
/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	/* Get every CPU (including this one) to drop any lazy %cr3
	   reference to mm's pagetable before unpinning it. */
	drop_mm_ref(mm);
	put_cpu();

	/* No CPU references the pagetable any more, so Xen will allow
	   the unpin (making the pagetable plain read-write memory). */
	xen_pgd_unpin(mm->pgd);
}
Loading