Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 919c0d14 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm:
  KVM: Remove now unused structs from kvm_para.h
  x86: KVM guest: Use the paravirt clocksource structs and functions
  KVM: Make kvm host use the paravirt clocksource structs
  x86: Make xen use the paravirt clocksource structs and functions
  x86: Add structs and functions for paravirt clocksource
  KVM: VMX: Fix host msr corruption with preemption enabled
  KVM: ioapic: fix lost interrupt when changing a device's irq
  KVM: MMU: Fix oops on guest userspace access to guest pagetable
  KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend)
  KVM: MMU: Fix rmap_write_protect() hugepage iteration bug
  KVM: close timer injection race window in __vcpu_run
  KVM: Fix race between timer migration and vcpu migration
parents de08341a 6b1ed908
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -383,6 +383,7 @@ config VMI
config KVM_CLOCK
	bool "KVM paravirtualized clock"
	select PARAVIRT
	select PARAVIRT_CLOCK
	depends on !(X86_VISWS || X86_VOYAGER)
	help
	  Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
	  over full virtualization.  However, when run without a hypervisor
	  the kernel is theoretically slower and slightly larger.

config PARAVIRT_CLOCK
	bool
	default n

endif

config MEMTEST_BOOTPARAM
+1 −0
Original line number Diff line number Diff line
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST)		+= kvm.o
obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o

obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o

+33 −56
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@

#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);

/* The hypervisor will put information about time periodically here */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
static struct pvclock_wall_clock wall_clock;

static inline u64 kvm_get_delta(u64 last_tsc)
{
	int cpu = smp_processor_id();
	u64 delta = native_read_tsc() - last_tsc;
	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
}

static struct kvm_wall_clock wall_clock;
static cycle_t kvm_clock_read(void);
/*
 * The wallclock is the time of day when we booted. Since then, some time may
 * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
 */
static unsigned long kvm_get_wallclock(void)
{
	u32 wc_sec, wc_nsec;
	u64 delta;
	struct pvclock_vcpu_time_info *vcpu_time;
	struct timespec ts;
	int version, nsec;
	int low, high;

	low = (int)__pa(&wall_clock);
	high = ((u64)__pa(&wall_clock) >> 32);
	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);

	delta = kvm_clock_read();
	vcpu_time = &get_cpu_var(hv_clock);
	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
	put_cpu_var(hv_clock);

	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
	do {
		version = wall_clock.wc_version;
		rmb();
		wc_sec = wall_clock.wc_sec;
		wc_nsec = wall_clock.wc_nsec;
		rmb();
	} while ((wall_clock.wc_version != version) || (version & 1));

	delta = kvm_clock_read() - delta;
	delta += wc_nsec;
	nsec = do_div(delta, NSEC_PER_SEC);
	set_normalized_timespec(&ts, wc_sec + delta, nsec);
	/*
	 * Of all mechanisms of time adjustment I've tested, this one
	 * was the champion!
	 */
	return ts.tv_sec + 1;
	return ts.tv_sec;
}

static int kvm_set_wallclock(unsigned long now)
{
	return 0;
	return -1;
}

/*
 * This is our read_clock function. The host puts an tsc timestamp each time
 * it updates a new time. Without the tsc adjustment, we can have a situation
 * in which a vcpu starts to run earlier (smaller system_time), but probes
 * time later (compared to another vcpu), leading to backwards time
 */
static cycle_t kvm_clock_read(void)
{
	u64 last_tsc, now;
	int cpu;
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	preempt_disable();
	cpu = smp_processor_id();

	last_tsc = get_clock(cpu, tsc_timestamp);
	now = get_clock(cpu, system_time);

	now += kvm_get_delta(last_tsc);
	preempt_enable();

	return now;
	src = &get_cpu_var(hv_clock);
	ret = pvclock_clocksource_read(src);
	put_cpu_var(hv_clock);
	return ret;
}

static struct clocksource kvm_clock = {
	.name = "kvm-clock",
	.read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

static int kvm_register_clock(void)
static int kvm_register_clock(char *txt)
{
	int cpu = smp_processor_id();
	int low, high;
	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);

	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
	       cpu, high, low, txt);
	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}

@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
	 * Now that the first cpu already had this clocksource initialized,
	 * we shouldn't fail.
	 */
	WARN_ON(kvm_register_clock());
	WARN_ON(kvm_register_clock("secondary cpu clock"));
	/* ok, done with our trickery, call native */
	setup_secondary_APIC_clock();
}
#endif

#ifdef CONFIG_SMP
void __init kvm_smp_prepare_boot_cpu(void)
{
	WARN_ON(kvm_register_clock("primary cpu clock"));
	native_smp_prepare_boot_cpu();
}
#endif

/*
 * After the clock is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shutdown, this memory
@@ -174,13 +148,16 @@ void __init kvmclock_init(void)
		return;

	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
		if (kvm_register_clock())
		if (kvm_register_clock("boot clock"))
			return;
		pv_time_ops.get_wallclock = kvm_get_wallclock;
		pv_time_ops.set_wallclock = kvm_set_wallclock;
		pv_time_ops.sched_clock = kvm_clock_read;
#ifdef CONFIG_X86_LOCAL_APIC
		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
		machine_ops.shutdown  = kvm_shutdown;
#ifdef CONFIG_KEXEC
+141 −0
Original line number Diff line number Diff line
/*  paravirtual clock -- common code used by kvm/xen

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <asm/pvclock.h>

/*
 * These are perodically updated
 *    xen: magic shared_info page
 *    kvm: gpa registered via msr
 * and then copied here.
 */
struct pvclock_shadow_time {
	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	int tsc_shift;
	u32 version;
};

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

/*
 * Reads a consistent set of time-base values from hypervisor,
 * into a shadow data area.
 */
static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
					struct pvclock_vcpu_time_info *src)
{
	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp     = src->tsc_timestamp;
		dst->system_timestamp  = src->system_time;
		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
		dst->tsc_shift         = src->tsc_shift;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) || (dst->version != src->version));

	return dst->version;
}

cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	struct pvclock_shadow_time shadow;
	unsigned version;
	cycle_t ret, offset;

	do {
		version = pvclock_get_time_values(&shadow, src);
		barrier();
		offset = pvclock_get_nsec_offset(&shadow);
		ret = shadow.system_timestamp + offset;
		barrier();
	} while (version != src->version);

	return ret;
}

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+6 −3
Original line number Diff line number Diff line
@@ -200,10 +200,13 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)

	atomic_inc(&pt->pending);
	smp_mb__after_atomic_inc();
	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
	if (vcpu0) {
		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
		if (waitqueue_active(&vcpu0->wq)) {
			vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
			wake_up_interruptible(&vcpu0->wq);
		}
	}

	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
	pt->scheduled = ktime_to_ns(pt->timer.expires);
Loading