Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f91a8b44 authored by Jeremy Fitzhardinge's avatar Jeremy Fitzhardinge Committed by Jeremy Fitzhardinge
Browse files

xen: Account for stolen time



This patch accounts for the time stolen from our VCPUs.  Stolen time is
time where a vcpu is runnable and could be running, but all available
physical CPUs are being used for something else.

This accounting gets run on each timer interrupt, just as a way to get
it run relatively often, and when interesting things are going on.
Stolen time is not really used by much in the kernel; it is reported
in /proc/stat, and that's about it.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
parent 9a4029fd
Loading
Loading
Loading
Loading
+150 −9
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -25,6 +26,7 @@

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
@@ -37,6 +39,139 @@ struct shadow_time_info {

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/*
 * Return a consistent snapshot of a 64-bit time/counter value.
 *
 * @p: address of the 64-bit value to sample.
 *
 * When BITS_PER_LONG is at least 64 the value is read directly.
 * Otherwise it is read as two 32-bit halves, retrying until the high
 * half reads the same before and after the low half was fetched.
 */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		/* Read-only view of the two halves; keeps p's const qualifier. */
		const u32 *p32 = (const u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX this treats p32[0] as the low half, i.e. it
		 * assumes a little-endian layout; some clean way to
		 * make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	preempt_disable();

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);

	preempt_enable();
}

/*
 * Register the per-cpu runstate area of @cpu with the hypervisor via
 * VCPUOP_register_runstate_memory_area.  A non-zero result from the
 * hypercall is treated as fatal (BUG).
 */
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	int rc;

	area.addr.v = &per_cpu(runstate, cpu);

	rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
				cpu, &area);
	if (rc != 0)
		BUG();
}

static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been runn*ing*  */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = 0;
	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = 0;
	while (blocked >= NS_PER_TICK) {
		ticks++;
		blocked -= NS_PER_TICK;
	}
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}



/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 cpu_khz = 1000000ULL << 32;
@@ -56,13 +191,11 @@ unsigned long xen_cpu_khz(void)
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
static unsigned get_time_values_from_xen(void)
{
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	preempt_disable();

	/* src is shared memory with the hypervisor, so we need to
	   make sure we get a consistent snapshot, even in the face of
	   being preempted. */
@@ -79,7 +212,7 @@ static void get_time_values_from_xen(void)
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) | (dst->version ^ src->version));

	preempt_enable();
	return dst->version;
}

/*
@@ -123,7 +256,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
/*
 * Offset since the shadow's TSC timestamp: the difference between the
 * current TSC reading and shadow->tsc_timestamp, passed through
 * scale_delta() with the shadow's tsc_to_nsec_mul and tsc_shift.
 * Presumably the result is in nanoseconds, going by the names -
 * confirm against scale_delta().
 */
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	/*
	 * NOTE(review): `now` is written twice in a row and only the
	 * native_read_tsc() value is used below.  This looks like the
	 * removed and added lines of a diff shown together - confirm
	 * which one belongs in the file.
	 */
	rdtscll(now);
	now = native_read_tsc();
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
@@ -132,10 +265,14 @@ cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;
	unsigned version;

	get_time_values_from_xen();

	do {
		version = get_time_values_from_xen();
		barrier();
		ret = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
	} while (version != __get_cpu_var(xen_vcpu)->time.version);

	put_cpu_var(shadow_time);

@@ -352,6 +489,8 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

@@ -378,6 +517,8 @@ static void xen_setup_timer(int cpu)
	evt->irq = irq;
	clockevents_register_device(evt);

	setup_runstate_info(cpu);

	put_cpu_var(xen_clock_events);
}

@@ -390,7 +531,7 @@ __init void xen_time_init(void)
	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100hz tick, so we have the
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;