
Commit 9dbbe3cf authored by Linus Torvalds
Pull kvm changes from Paolo Bonzini:
 "Remove from guest code the handling of task migration during a pvclock
  read; instead use the correct protocol in KVM.

  This removes the need for task migration notifiers in core scheduler
  code"

[ The scheduler people really hated the migration notifiers, so this was
  kind of required  - Linus ]

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  x86: pvclock: Really remove the sched notifier for cross-cpu migrations
  kvm: x86: fix kvmclock update protocol
parents fb45f493 73459e2a
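
Some context for the diffs that follow: pvclock publishes per-vCPU time data to the guest through a seqlock-style version field (odd while an update is in flight, even when the data is consistent), and both patches hinge on each side honoring that protocol. Below is a minimal userspace sketch of the guest-side read, not kernel source: the reduced struct, rdtsc_cb, and the GCC __sync_synchronize() barriers stand in for pvclock_vcpu_time_info, rdtsc, and smp_rmb().

#include <stdint.h>

/* Reduced stand-in for struct pvclock_vcpu_time_info; as in the real
 * struct, version comes first and is odd while an update is in flight. */
struct pvti_model {
	uint32_t version;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t   tsc_shift;
};

/* Seqlock-style read: retry until version is even and unchanged across
 * the data reads, i.e. no update raced with us. */
static uint64_t pvclock_read_model(const volatile struct pvti_model *p,
				   uint64_t (*rdtsc_cb)(void))
{
	uint32_t version;
	uint64_t delta, ns;

	do {
		version = p->version;
		__sync_synchronize();		/* rmb: version before data */
		delta = rdtsc_cb() - p->tsc_timestamp;
		if (p->tsc_shift >= 0)
			delta <<= p->tsc_shift;
		else
			delta >>= -p->tsc_shift;
		/* 128-bit product, as the kernel's scaling helpers use */
		ns = p->system_time +
		     (uint64_t)(((unsigned __int128)delta *
				 p->tsc_to_system_mul) >> 32);
		__sync_synchronize();		/* rmb: data before recheck */
	} while ((version & 1) || p->version != version);

	return ns;
}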
arch/x86/include/asm/pvclock.h +0 −1
@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,

 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));

 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
arch/x86/kernel/pvclock.c +0 −44
@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }

-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-	if (!pvclock_vdso_info) {
-		BUG();
-		return NULL;
-	}
-
-	return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-			        void *v)
-{
-	struct task_migration_notifier *mn = v;
-	struct pvclock_vsyscall_time_info *pvti;
-
-	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-	/* this is NULL when pvclock vsyscall is not initialized */
-	if (unlikely(pvti == NULL))
-		return NOTIFY_DONE;
-
-	pvti->migrate_count++;
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-	.notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,

 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);

-	pvclock_vdso_info = i;
-
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}

-
-	register_task_migration_notifier(&pvclock_migrate);
-
 	return 0;
 }
 #endif
arch/x86/kvm/x86.c +28 −5
@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		&guest_hv_clock, sizeof(guest_hv_clock))))
 		return 0;

-	/*
-	 * The interface expects us to write an even number signaling that the
-	 * update is finished. Since the guest won't see the intermediate
-	 * state, we just increase by 2 at the end.
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
 	 */
-	vcpu->hv_clock.version = guest_hv_clock.version + 2;
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();

 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
 	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 				&vcpu->hv_clock,
 				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
 	return 0;
 }
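
To make the ordering constraint in the comment above concrete, here is a self-contained userspace model of the writer side; it is not the KVM code. clock_page is a reduced stand-in for pvclock_vcpu_time_info and __sync_synchronize() for smp_wmb(). The point is that one bulk copy cannot replace the three writes: within a single kvm_write_guest_cached() (potentially a "rep movs"), the version bytes may reach guest memory before or after the payload, so the version must be pushed out separately on both sides of the data.

#include <stdint.h>

/* Reduced guest-visible clock page: version first, then the payload. */
struct clock_page {
	uint32_t version;	/* odd: update in flight; even: consistent */
	uint64_t tsc_timestamp;
	uint64_t system_time;
};

/* Mirrors the three writes in kvm_guest_time_update() above. */
static void clock_publish(volatile struct clock_page *dst,
			  const struct clock_page *src)
{
	uint32_t version = dst->version + 1;	/* even -> odd */

	dst->version = version;			/* 1st write: version only */
	__sync_synchronize();			/* wmb */

	dst->tsc_timestamp = src->tsc_timestamp;	/* 2nd write: payload */
	dst->system_time = src->system_time;

	__sync_synchronize();			/* wmb */
	dst->version = version + 1;		/* 3rd write: even again */
}

A reader running the seqlock-style loop sketched after the commit message spins while the version is odd and retries if it changed mid-read, so it can never consume the half-written payload.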

arch/x86/vdso/vclock_gettime.c +15 −19
@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;


 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
 		 * __getcpu() calls (Gleb).
 		 */

-		/* Make sure migrate_count will change if we leave the VCPU. */
-		do {
-			pvti = get_pvti(cpu);
-			migrate_count = pvti->migrate_count;
-
-			cpu1 = cpu;
-			cpu = __getcpu() & VGETCPU_CPU_MASK;
-		} while (unlikely(cpu != cpu1));
+		pvti = get_pvti(cpu);

 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);

 		/*
 		 * Test we're still on the cpu as well as the version.
-		 * - We must read TSC of pvti's VCPU.
-		 * - KVM doesn't follow the versioning protocol, so data could
-		 *   change before version if we left the VCPU.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
 		 */
-		smp_rmb();
-	} while (unlikely((pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version));

 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
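
The three guarantees in the new comment are what make the simpler loop sound, and re-reading the cpu number closes the race described above. As a compact userspace model of the rewritten loop (get_cpu_cb() stands in for __getcpu(), and a bare cycles field for the payload read by __pvclock_read_cycles(); all names here are illustrative):

#include <stdint.h>

/* One per-cpu time-info slot; cycles stands in for the pvclock payload. */
struct pvti_slot {
	uint32_t version;
	uint64_t cycles;
};

static uint64_t vread_model(const volatile struct pvti_slot *info,
			    unsigned (*get_cpu_cb)(void))
{
	const volatile struct pvti_slot *pvti;
	unsigned cpu, cpu1;
	uint32_t version;
	uint64_t ret;

	do {
		cpu = get_cpu_cb();
		pvti = &info[cpu];

		version = pvti->version;
		__sync_synchronize();		/* rmb */
		ret = pvti->cycles;
		__sync_synchronize();		/* rmb */

		/* We may have been migrated after the first get_cpu_cb()
		 * but before reading version; a version bump in the old
		 * cpu's slot would then go unnoticed, so re-check the cpu
		 * as well as the version. */
		cpu1 = get_cpu_cb();
	} while (cpu != cpu1 || (version & 1) || pvti->version != version);

	return ret;
}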
include/linux/sched.h +0 −8
@@ -175,14 +175,6 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);

-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-	struct task_struct *task;
-	int from_cpu;
-	int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);

 extern void dump_cpu_task(int cpu);