cpuidle: fix the menu governor to boost IO performance (69d25870) · Commits · e / devices / android_kernel_sony_msm8998

drivers/cpuidle/governors/menu.c

+212 −39

Original line number	Diff line number	Diff line
		@@ -2,8 +2,12 @@
		* menu.c - the menu idle governor
		*
		* Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
		* Copyright (C) 2009 Intel Corporation
		* Author:
		* Arjan van de Ven <arjan@linux.intel.com>
		*
		* This code is licenced under the GPL.
		* This code is licenced under the GPL version 2 as described
		* in the COPYING file that accompanies the Linux Kernel.
		*/

		#include <linux/kernel.h>
		@@ -13,20 +17,153 @@
		#include <linux/ktime.h>
		#include <linux/hrtimer.h>
		#include <linux/tick.h>
		#include <linux/sched.h>

		/*
		* Slack, in microseconds, added to the last idle time before it is
		* compared with (expected_us - exit_latency) in menu_reflect.
		*/
		#define BREAK_FUZZ 4 /* 4 us */
		/*
		* Percentage weight given to the correction-factor based prediction
		* when menu_select blends it with current_predicted_us; the remaining
		* (100 - PRED_HISTORY_PCT) percent goes to current_predicted_us.
		*/
		#define PRED_HISTORY_PCT 50
		/*
		* Number of correction factors kept per menu_device. which_bucket
		* uses the upper BUCKETS/2 entries when IO is pending on this CPU
		* and the lower BUCKETS/2 entries otherwise.
		*/
		#define BUCKETS 12
		/*
		* Scale of the correction factors: a stored factor of
		* RESOLUTION * DECAY is treated as unity by menu_select.
		*/
		#define RESOLUTION 1024
		/*
		* Running-average weight: on each update menu_reflect keeps
		* (DECAY - 1) / DECAY of the previous correction factor.
		*/
		#define DECAY 4
		/*
		* Threshold that menu_reflect compares against before scaling the
		* correction factor by the measured/expected ratio; otherwise the
		* sleep is counted as a perfect prediction. The file header describes
		* this limit as 50 milliseconds, i.e. the value is in microseconds.
		*/
		#define MAX_INTERESTING 50000

		/*
		* Concepts and ideas behind the menu governor
		*
		* For the menu governor, there are 3 decision factors for picking a C
		* state:
		* 1) Energy break even point
		* 2) Performance impact
		* 3) Latency tolerance (from pmqos infrastructure)
		* These three factors are treated independently.
		*
		* Energy break even point
		* -----------------------
		* C state entry and exit have an energy cost, and a certain amount of time in
		* the C state is required to actually break even on this cost. CPUIDLE
		* provides us this duration in the "target_residency" field. So all that we
		* need is a good prediction of how long we'll be idle. Like the traditional
		* menu governor, we start with the actual known "next timer event" time.
		*
		* Since there are other sources of wakeups (interrupts for example) than
		* the next timer event, this estimation is rather optimistic. To get a
		* more realistic estimate, a correction factor is applied to the estimate,
		* that is based on historic behavior. For example, if in the past the actual
		* duration always was 50% of the next timer tick, the correction factor will
		* be 0.5.
		*
		* menu uses a running average for this correction factor, however it uses a
		* set of factors, not just a single factor. This stems from the realization
		* that the ratio is dependent on the order of magnitude of the expected
		* duration; if we expect 500 milliseconds of idle time the likelihood of
		* getting an interrupt very early is much higher than if we expect 50 micro
		* seconds of idle time. A second independent factor that has big impact on
		* the actual factor is if there is (disk) IO outstanding or not.
		* (as a special twist, we consider every sleep longer than 50 milliseconds
		* as perfect; there are no power gains for sleeping longer than this)
		*
		* For these two reasons we keep an array of 12 independent factors, that gets
		* indexed based on the magnitude of the expected duration as well as the
		* "is IO outstanding" property.
		*
		* Limiting Performance Impact
		* ---------------------------
		* C states, especially those with large exit latencies, can have a real
		* noticeable impact on workloads, which is not acceptable for most sysadmins,
		* and in addition, less performance has a power price of its own.
		*
		* As a general rule of thumb, menu assumes that the following heuristic
		* holds:
		* The busier the system, the less impact of C states is acceptable
		*
		* This rule-of-thumb is implemented using a performance-multiplier:
		* If the exit latency times the performance multiplier is longer than
		* the predicted duration, the C state is not considered a candidate
		* for selection due to a too high performance impact. So the higher
		* this multiplier is, the longer we need to be idle to pick a deep C
		* state, and thus the less likely a busy CPU will hit such a deep
		* C state.
		*
		* Two factors are used in determining this multiplier:
		* a value of 10 is added for each point of "per cpu load average" we have.
		* a value of 5 points is added for each process that is waiting for
		* IO on this CPU.
		* (these values are experimentally determined)
		*
		* The load average factor gives a longer term (few seconds) input to the
		* decision, while the iowait value gives a cpu local instantaneous input.
		* The iowait factor may look low, but realize that this is also already
		* represented in the system load average.
		*
		*/

		/*
		* Per-CPU bookkeeping for the menu governor (instantiated through
		* DEFINE_PER_CPU as menu_devices).
		*
		* NOTE(review): predicted_us is declared twice below (unsigned int and
		* u64). This listing appears to interleave the removed and the added
		* lines of a diff, so some members probably belong to the old layout
		* only - confirm against the real file before relying on this layout.
		*/
		struct menu_device {
		/* index of the idle state chosen by menu_select */
		int last_state_idx;

		/* time until the next timer event, in microseconds (rounded up) */
		unsigned int expected_us;
		unsigned int predicted_us;
		/* max(measured_us, last_measured_us) as computed in menu_reflect */
		unsigned int current_predicted_us;
		/* measured idle time recorded when the wakeup came early */
		unsigned int last_measured_us;
		/* cumulative idle time carried over between menu_reflect calls */
		unsigned int elapsed_us;
		/* expected_us scaled by the correction factor of the current bucket */
		u64 predicted_us;
		/*
		* Compared against MAX_INTERESTING in menu_reflect.
		* NOTE(review): no visible code stores to this member; menu_reflect
		* works on a local variable of the same name - verify.
		*/
		unsigned int measured_us;
		/* exit latency of the selected state; 0 when latency_req is 0 */
		unsigned int exit_us;
		/* index into correction_factor[], as returned by which_bucket */
		unsigned int bucket;
		/* running averages scaled by RESOLUTION * DECAY; 0 means "unset" */
		u64 correction_factor[BUCKETS];
		};


		/*
		 * Helpers that split a value treated as FSHIFT-bit fixed point:
		 * LOAD_INT yields the integer part, LOAD_FRAC the fractional part
		 * scaled by 100 (hundredths, assuming FIXED_1 == 1 << FSHIFT).
		 */
		#define LOAD_INT(x) ((x) >> FSHIFT)
		#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

		/*
		 * Convert the value reported by this_cpu_load() into tenths of a
		 * load point: ten times the integer part plus the first decimal
		 * digit of the fraction.
		 *
		 * NOTE(review): this interprets this_cpu_load() as FSHIFT fixed
		 * point - confirm that this matches the units of that value.
		 */
		static int get_loadavg(void)
		{
			const unsigned long load = this_cpu_load();
			const unsigned long whole_tenths = LOAD_INT(load) * 10;
			const unsigned long frac_tenths = LOAD_FRAC(load) / 10;

			return whole_tenths + frac_tenths;
		}

		/*
		 * Map an expected idle duration (microseconds) to the index of the
		 * correction factor that should be used for it.
		 *
		 * Two groups of statistics are kept: one for when IO is pending on
		 * this CPU and one for when it is not, which lets the governor
		 * estimate E(duration)|iowait. Within a group the index is the
		 * decimal order of magnitude of the duration: <10, <100, <1000,
		 * <10000, <100000, and everything above.
		 */
		static inline int which_bucket(unsigned int duration)
		{
			/* IO pending selects the upper half of the factor array */
			const int base = nr_iowait_cpu() ? BUCKETS / 2 : 0;
			unsigned int limit = 10;
			int magnitude;

			/* walk the decade limits 10 .. 100000; 5 means "beyond all" */
			for (magnitude = 0; magnitude < 5; magnitude++) {
				if (duration < limit)
					break;
				limit *= 10;
			}

			return base + magnitude;
		}

		/*
		* Return a multiplier for the exit latency that is intended
		* to take performance requirements into account.
		* The more performance critical we estimate the system
		* to be, the higher this multiplier, and thus the higher
		* the barrier to go to an expensive C state.
		*/
		static inline int performance_multiplier(void)
		{
			/*
			 * A higher load makes us more reluctant: add twice the value
			 * returned by get_loadavg().
			 */
			const int load_term = 2 * get_loadavg();

			/*
			 * Tasks waiting for IO on this CPU add 10 each.
			 * NOTE(review): the file header comment speaks of 10 per load
			 * point and 5 per iowait task, while the code applies the
			 * weights above - confirm which is intended.
			 */
			const unsigned long iowait_term = 10 * nr_iowait_cpu();

			/* the baseline of 1 keeps the multiplier from ever being zero */
			return 1 + load_term + iowait_term;
		}

		static DEFINE_PER_CPU(struct menu_device, menu_devices);

		/**
		@@ -38,37 +175,59 @@ static int menu_select(struct cpuidle_device *dev)
		struct menu_device *data = &__get_cpu_var(menu_devices);
		int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
		int i;
		int multiplier;

		/* Special case when user has set very strict latency requirement */
		if (unlikely(latency_req == 0)) {
		data->last_state_idx = 0;
		data->exit_us = 0;

		/* Special case when user has set very strict latency requirement */
		if (unlikely(latency_req == 0))
		return 0;
		}

		/* determine the expected residency time */
		/* determine the expected residency time, round up */
		data->expected_us =
		(u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000;
		DIV_ROUND_UP((u32)ktime_to_ns(tick_nohz_get_sleep_length()), 1000);


		data->bucket = which_bucket(data->expected_us);

		multiplier = performance_multiplier();

		/*
		* if the correction factor is 0 (eg first time init or cpu hotplug
		* etc), we actually want to start out with a unity factor.
		*/
		if (data->correction_factor[data->bucket] == 0)
		data->correction_factor[data->bucket] = RESOLUTION * DECAY;

		/* Make sure to round up for half microseconds */
		data->predicted_us = DIV_ROUND_CLOSEST(
		data->expected_us * data->correction_factor[data->bucket],
		RESOLUTION * DECAY);

		/*
		* We want to default to C1 (hlt), not to busy polling
		* unless the timer is happening really really soon.
		*/
		if (data->expected_us > 5)
		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;

		/* Recalculate predicted_us based on prediction_history_pct */
		data->predicted_us *= PRED_HISTORY_PCT;
		data->predicted_us += (100 - PRED_HISTORY_PCT) *
		data->current_predicted_us;
		data->predicted_us /= 100;

		/* find the deepest idle state that satisfies our constraints */
		for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) {
		for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
		struct cpuidle_state *s = &dev->states[i];

		if (s->target_residency > data->expected_us)
		break;
		if (s->target_residency > data->predicted_us)
		break;
		if (s->exit_latency > latency_req)
		break;
		if (s->exit_latency * multiplier > data->predicted_us)
		break;
		data->exit_us = s->exit_latency;
		data->last_state_idx = i;
		}

		data->last_state_idx = i - 1;
		return i - 1;
		return data->last_state_idx;
		}

		/**
		@@ -85,35 +244,49 @@ static void menu_reflect(struct cpuidle_device *dev)
		unsigned int last_idle_us = cpuidle_get_last_residency(dev);
		struct cpuidle_state *target = &dev->states[last_idx];
		unsigned int measured_us;
		u64 new_factor;

		/*
		* Ugh, this idle state doesn't support residency measurements, so we
		* are basically lost in the dark. As a compromise, assume we slept
		* for one full standard timer tick. However, be aware that this
		* could potentially result in a suboptimal state transition.
		* for the whole expected time.
		*/
		if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID)))
		last_idle_us = USEC_PER_SEC / HZ;
		last_idle_us = data->expected_us;


		measured_us = last_idle_us;

		/*
		* measured_us and elapsed_us are the cumulative idle time, since the
		* last time we were woken out of idle by an interrupt.
		* We correct for the exit latency; we are assuming here that the
		* exit latency happens after the event that we're interested in.
		*/
		if (data->elapsed_us <= data->elapsed_us + last_idle_us)
		measured_us = data->elapsed_us + last_idle_us;
		if (measured_us > data->exit_us)
		measured_us -= data->exit_us;


		/* update our correction ratio */

		new_factor = data->correction_factor[data->bucket]
		* (DECAY - 1) / DECAY;

		if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING)
		new_factor += RESOLUTION * measured_us / data->expected_us;
		else
		measured_us = -1;
		/*
		* we were idle so long that we count it as a perfect
		* prediction
		*/
		new_factor += RESOLUTION;

		/* Predict time until next break event */
		data->current_predicted_us = max(measured_us, data->last_measured_us);
		/*
		* We don't want 0 as factor; we always want at least
		* a tiny bit of estimated time.
		*/
		if (new_factor == 0)
		new_factor = 1;

		if (last_idle_us + BREAK_FUZZ <
		data->expected_us - target->exit_latency) {
		data->last_measured_us = measured_us;
		data->elapsed_us = 0;
		} else {
		data->elapsed_us = measured_us;
		}
		data->correction_factor[data->bucket] = new_factor;
		}

		/**

include/linux/sched.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -140,6 +140,10 @@ extern int nr_processes(void);
		extern unsigned long nr_running(void);
		extern unsigned long nr_uninterruptible(void);
		extern unsigned long nr_iowait(void);
		extern unsigned long nr_iowait_cpu(void);
		extern unsigned long this_cpu_load(void);


		extern void calc_global_load(void);
		extern u64 cpu_nr_migrations(int cpu);

kernel/sched.c

+13 −0

Original line number	Diff line number	Diff line
		@@ -2904,6 +2904,19 @@ unsigned long nr_iowait(void)
		return sum;
		}

		unsigned long nr_iowait_cpu(void)
		{
		struct rq *this = this_rq();
		return atomic_read(&this->nr_iowait);
		}

		unsigned long this_cpu_load(void)
		{
		struct rq *this = this_rq();
		return this->cpu_load[0];
		}


		/* Variables and functions for calc_load */
		static atomic_long_t calc_load_tasks;
		static unsigned long calc_load_update;