Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 62c4d9af authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'stable/for-linus-3.6-rc0-tag' of...

Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen

Pull Xen update from Konrad Rzeszutek Wilk:
 "Features:
   * Performance improvement to lower the number of traps the hypervisor
     has to handle for 32-bit guests.  Mainly for setting PTE entries and
     updating TLS descriptors.
   * MCE polling driver to collect hypervisor MCE buffer and present
     them to /dev/mcelog.
   * Physical CPU online/offline support.  When a privileged guest is
     booted it is presented with virtual CPUs, which might have a 1:1
     mapping to physical CPUs but usually don't.  This provides a
     mechanism to offline/online physical CPUs.
  Bug-fixes for:
   * Fixes for issues found by Coverity in the console and ACPI processor driver.
   * PVonHVM kexec fixes along with some cleanups.
   * Pages that fall within E820 gaps and non-RAM regions (and had been
     released to hypervisor) would be populated back, but potentially in
     non-RAM regions."

* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: populate correct number of pages when across mem boundary (v2)
  xen PVonHVM: move shared_info to MMIO before kexec
  xen: simplify init_hvm_pv_info
  xen: remove cast from HYPERVISOR_shared_info assignment
  xen: enable platform-pci only in a Xen guest
  xen/pv-on-hvm kexec: shutdown watches from old kernel
  xen/x86: avoid updating TLS descriptors if they haven't changed
  xen/x86: add desc_equal() to compare GDT descriptors
  xen/mm: zero PTEs for non-present MFNs in the initial page table
  xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
  xen/hvc: Fix up checks when the info is allocated.
  xen/acpi: Fix potential memory leak.
  xen/mce: add .poll method for mcelog device driver
  xen/mce: schedule a workqueue to avoid sleep in atomic context
  xen/pcpu: Xen physical cpus online/offline sys interface
  xen/mce: Register native mce handler as vMCE bounce back point
  x86, MCE, AMD: Adjust initcall sequence for xen
  xen/mce: Add mcelog support for Xen platform
parents 5fecc9d8 c3d93f88
Loading
Loading
Loading
Loading
+20 −0
Original line number Diff line number Diff line
What:		/sys/devices/system/xen_cpu/
Date:		May 2012
Contact:	Liu, Jinsong <jinsong.liu@intel.com>
Description:
		A collection of global/individual Xen physical cpu attributes

		Individual physical cpu attributes are contained in
		subdirectories named by the Xen's logical cpu number, e.g.:
		/sys/devices/system/xen_cpu/xen_cpu#/


What:		/sys/devices/system/xen_cpu/xen_cpu#/online
Date:		May 2012
Contact:	Liu, Jinsong <jinsong.liu@intel.com>
Description:
		Interface to online/offline Xen physical cpus

		When running under the Xen platform, this provides a user
		interface to online/offline physical cpus, except cpu0 due to
		several logic restrictions and assumptions.
+8 −0
Original line number Diff line number Diff line
@@ -48,6 +48,7 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
#include <xen/interface/platform.h>
#include <xen/interface/xen-mca.h>

/*
 * The hypercall asms have to meet several constraints:
@@ -301,6 +302,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
	return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
}

/*
 * Issue the "mca" hypercall with @mc_op as its single argument.
 *
 * The interface_version member of @mc_op is overwritten with
 * XEN_MCA_INTERFACE_VERSION before the call, so callers need not set it.
 * Returns the int result of the hypercall.
 */
static inline int
HYPERVISOR_mca(struct xen_mc *mc_op)
{
	int ret;

	mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
	ret = _hypercall1(int, mca, mc_op);

	return ret;
}

static inline int
HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
{
+1 −3
Original line number Diff line number Diff line
@@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;
@@ -2346,7 +2344,7 @@ static __init int mcheck_init_device(void)

	return err;
}
device_initcall(mcheck_init_device);
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
+21 −1
Original line number Diff line number Diff line
@@ -759,4 +759,24 @@ static __init int threshold_init_device(void)

	return 0;
}
device_initcall(threshold_init_device);
/*
 * there are 3 funcs which need to be _initcalled in a logical sequence:
 * 1. xen_late_init_mcelog
 * 2. mcheck_init_device
 * 3. threshold_init_device
 *
 * xen_late_init_mcelog must register xen_mce_chrdev_device before
 * native mce_chrdev_device registration if running under xen platform;
 *
 * mcheck_init_device should be inited before threshold_init_device to
 * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
 *
 * so we use following _initcalls
 * 1. device_initcall(xen_late_init_mcelog);
 * 2. device_initcall_sync(mcheck_init_device);
 * 3. late_initcall(threshold_init_device);
 *
 * when running under xen, the initcall order is 1,2,3;
 * on baremetal, we skip 1 and we do only 2 and 3.
 */
late_initcall(threshold_init_device);
+173 −51
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/syscore_ops.h>

#include <xen/xen.h>
#include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 */
static int have_vcpu_info_placement = 1;

/* Holds one desc_struct for each of the 3 TLS descriptors. */
struct tls_descs {
	struct desc_struct desc[3];
};

/*
 * Updating the 3 TLS descriptors in the GDT on every task switch is
 * surprisingly expensive so we avoid updating them if they haven't
 * changed.  Since Xen writes different descriptors than the one
 * passed in the update_descriptor hypercall we keep shadow copies to
 * compare against.
 */
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);

static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
	unsigned int xsave_mask;

	cpuid_leaf1_edx_mask =
		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
		BUG();
}

/*
 * Compare two descriptors member by member.
 * Returns true only when both the 'a' and the 'b' members match.
 */
static inline bool desc_equal(const struct desc_struct *d1,
			      const struct desc_struct *d2)
{
	if (d1->a != d2->a)
		return false;

	return d1->b == d2->b;
}

static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	struct multicall_space mc = __xen_mc_entry(0);
	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
	struct desc_struct *gdt;
	xmaddr_t maddr;
	struct multicall_space mc;

	if (desc_equal(shadow, &t->tls_array[i]))
		return;

	*shadow = t->tls_array[i];

	gdt = get_cpu_gdt_table(cpu);
	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately.  The debugger ones are the only ones we care
	 * about.  Xen will handle faults like double_fault and
	 * machine_check, so we should never see them.  Warn if
	 * about.  Xen will handle faults like double_fault,
	 * so we should never see them.  Warn if
	 * there's an unexpected IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		return 0;
		/*
		 * when xen hypervisor inject vMCE to guest,
		 * use native mce handler to handle it
		 */
		;
#endif
	} else {
		/* Some other trap using IST? */
@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}

static int init_hvm_pv_info(int *major, int *minor)
{
	uint32_t eax, ebx, ecx, edx, pages, msr, base;
	u64 pfn;

	base = xen_cpuid_base();
	cpuid(base + 1, &eax, &ebx, &ecx, &edx);

	*major = eax >> 16;
	*minor = eax & 0xffff;
	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);

	cpuid(base + 2, &pages, &msr, &ecx, &edx);

	pfn = __pa(hypercall_page);
	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));

	xen_setup_features();

	pv_info.name = "Xen HVM";

	xen_domain_type = XEN_HVM_DOMAIN;
#ifdef CONFIG_XEN_PVHVM
/*
 * The pfn containing the shared_info is located somewhere in RAM. This
 * will cause trouble if the current kernel is doing a kexec boot into a
 * new kernel. The new kernel (and its startup code) can not know where
 * the pfn is, so it can not reserve the page. The hypervisor will
 * continue to update the pfn, and as a result memory corruption occurs
 * in the new kernel.
 *
 * One way to work around this issue is to allocate a page in the
 * xen-platform pci device's BAR memory range. But pci init is done very
 * late and the shared_info page is already in use very early to read
 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
 * code paths on other vcpus could access the pfn during the small
 * window when the old pfn is moved to the new pfn. There is even a
 * small window where the old pfn is not backed by a mfn, and during that
 * time all reads return -1.
 *
 * Because it is not known upfront where the MMIO region is located it
 * can not be used right from the start in xen_hvm_init_shared_info.
 *
 * To minimise trouble the move of the pfn is done shortly before kexec.
 * This does not eliminate the race because all vcpus are still online
 * when the syscore_ops will be called. But hopefully there is no work
 * pending at this point in time. Also the syscore_op is run last which
 * reduces the risk further.
 */

	return 0;
}
static struct shared_info *xen_hvm_shared_info;

void __ref xen_hvm_init_shared_info(void)
static void xen_hvm_connect_shared_info(unsigned long pfn)
{
	int cpu;
	struct xen_add_to_physmap xatp;
	static struct shared_info *shared_info_page = 0;

	if (!shared_info_page)
		shared_info_page = (struct shared_info *)
			extend_brk(PAGE_SIZE, PAGE_SIZE);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
	xatp.gpfn = pfn;
	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();

	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
}
static void xen_hvm_set_shared_info(struct shared_info *sip)
{
	int cpu;

	HYPERVISOR_shared_info = sip;

	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
	 * page, we use it in the event channel upcall and in some pvclock
	 * related functions. We don't need the vcpu_info placement
	 * optimizations because we don't use any pv_mmu or pv_irq op on
	 * HVM.
	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
	 * online but xen_hvm_init_shared_info is run at resume time too and
	 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
	 * online but xen_hvm_set_shared_info is run at resume time too and
	 * in that case multiple vcpus might be online. */
	for_each_online_cpu(cpu) {
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
	}
}

#ifdef CONFIG_XEN_PVHVM
/*
 * Reconnect the shared_info pfn to a mfn, using the page remembered in
 * xen_hvm_shared_info.
 */
void xen_hvm_resume_shared_info(void)
{
	unsigned long pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;

	xen_hvm_connect_shared_info(pfn);
}

#ifdef CONFIG_KEXEC
static struct shared_info *xen_hvm_shared_info_kexec;
static unsigned long xen_hvm_shared_info_pfn_kexec;

/*
 * Remember a shared_info location in MMIO space for kexec reboot.
 * @sip: pointer saved in xen_hvm_shared_info_kexec
 * @pfn: page frame number saved in xen_hvm_shared_info_pfn_kexec
 *
 * Both values are read back by xen_hvm_syscore_shutdown().
 */
void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
{
	xen_hvm_shared_info_pfn_kexec = pfn;
	xen_hvm_shared_info_kexec = sip;
}

/*
 * syscore shutdown hook: move shared_info from its RAM page to the MMIO
 * page recorded by xen_hvm_prepare_kexec(), then repopulate the RAM pfn.
 *
 * Does nothing when no MMIO page was recorded (xen_hvm_shared_info_kexec
 * is NULL). Hits BUG_ON() if the old pfn cannot be repopulated.
 */
static void xen_hvm_syscore_shutdown(void)
{
	/* Request for a single extent on behalf of this domain */
	struct xen_memory_reservation reservation = {
		.domid = DOMID_SELF,
		.nr_extents = 1,
	};
	unsigned long prev_pfn;
	int rc;

	/* No MMIO location was registered, keep shared_info where it is */
	if (!xen_hvm_shared_info_kexec)
		return;

	/* The pfn currently holding shared_info is the extent to repopulate */
	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
	set_xen_guest_handle(reservation.extent_start, &prev_pfn);

	/* Move pfn to MMIO, disconnects previous pfn from mfn */
	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);

	/* Update pointers, following hypercall is also a memory barrier */
	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);

	/*
	 * Allocate new mfn for previous pfn.
	 * NOTE(review): rc is presumably the number of extents populated;
	 * 0 is treated as "nothing populated yet", so sleep and retry.
	 */
	do {
		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
		if (rc == 0)
			msleep(123);
	} while (rc == 0);

	/* Make sure the previous pfn is really connected to a (new) mfn */
	BUG_ON(rc != 1);
}

/* syscore hooks: only .shutdown is set, pointing at xen_hvm_syscore_shutdown() */
static struct syscore_ops xen_hvm_syscore_ops = {
	.shutdown = xen_hvm_syscore_shutdown,
};
#endif

/*
 * Boot-time setup: place shared_info on a page in RAM obtained from
 * extend_brk(). The page may move to MMIO before kexec.
 */
static void __init xen_hvm_init_shared_info(void)
{
	unsigned long pfn;

	/* Keep the pointer around, resume needs it to reconnect the page */
	xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
	pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;

	xen_hvm_connect_shared_info(pfn);
	xen_hvm_set_shared_info(xen_hvm_shared_info);
}

/*
 * Early HVM guest setup: print the Xen version, write the physical
 * address of hypercall_page to the MSR reported by cpuid, set up the
 * Xen feature flags and mark this domain as XEN_HVM_DOMAIN.
 */
static void __init init_hvm_pv_info(void)
{
	uint32_t base = xen_cpuid_base();
	uint32_t eax, ebx, ecx, edx;
	uint32_t pages, msr;
	int major, minor;
	u64 hypercall_pa;

	/* Leaf base + 1: eax carries major in the upper and minor in the lower 16 bits */
	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
	minor = eax & 0xffff;
	major = eax >> 16;
	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

	/* Leaf base + 2: the second output register names the MSR to write */
	cpuid(base + 2, &pages, &msr, &ecx, &edx);

	/* Physical address of hypercall_page, split into low and high 32-bit halves */
	hypercall_pa = __pa(hypercall_page);
	wrmsr_safe(msr, (u32)hypercall_pa, (u32)(hypercall_pa >> 32));

	xen_setup_features();

	pv_info.name = "Xen HVM";

	xen_domain_type = XEN_HVM_DOMAIN;
}

static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {

static void __init xen_hvm_guest_init(void)
{
	int r;
	int major, minor;

	r = init_hvm_pv_info(&major, &minor);
	if (r < 0)
		return;
	init_hvm_pv_info();

	xen_hvm_init_shared_info();
#ifdef CONFIG_KEXEC
	register_syscore_ops(&xen_hvm_syscore_ops);
#endif

	if (xen_feature(XENFEAT_hvm_callback_vector))
		xen_have_vector_callback = 1;
Loading