
Commit 7897986b authored by Nick Piggin, committed by Linus Torvalds

[PATCH] sched: balance timers

Do CPU load averaging over a number of different intervals.  Allow each
interval to be chosen by passing a parameter to source_load and target_load.
0 is instantaneous; idx > 0 returns a decaying average with the most recent
sample weighted at 2^(idx-1), up to a maximum idx of 3 (which could easily be
increased).

In general, a higher index results in more conservative balancing.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 99b61ccf
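
For illustration, here is a small standalone sketch of the averaging scheme
described in the commit message above (ordinary user-space C, not the kernel
code; the names update_cpu_load and NR_LOAD_IDX are made up for this example).
It mirrors the cpu_load[] update done in rebalance_tick() and the
cpu_load[type-1] lookup in source_load()/target_load() in the diff below:
index 0 is the raw instantaneous load, higher indexes decay more slowly, and
the averaging division is rounded up while the load is rising so the average
can actually reach the new value.

#include <stdio.h>

#define NR_LOAD_IDX 3	/* mirrors the cpu_load[3] array added by the patch */

static unsigned long cpu_load[NR_LOAD_IDX];

/* Fold one instantaneous load sample into every decaying average. */
static void update_cpu_load(unsigned long this_load)
{
	int i;

	for (i = 0; i < NR_LOAD_IDX; i++) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;
		int scale = 1 << i;

		/* Round up when rising so the average does not get stuck
		 * just below the new value. */
		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
	}
}

int main(void)
{
	int tick;

	/* A CPU that suddenly carries a constant load of 10. */
	for (tick = 1; tick <= 6; tick++) {
		update_cpu_load(10);
		printf("tick %d: idx1=%lu idx2=%lu idx3=%lu\n",
		       tick, cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}

Starting from 0, the slowest (idx = 3, scale 4) average climbs 3, 5, 7, 8, 9,
10 over six ticks; without the round-up it would stall at 7 and never reach
the true load, which is the same effect the "stuck on 9" comment in
rebalance_tick() guards against for the old (old_load + this_load) / 2
formula.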
+4 −0
@@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node)
	.imbalance_pct		= 125,			\
	.cache_hot_time		= (10*1000000),		\
	.cache_nice_tries	= 1,			\
	.busy_idx		= 3,			\
	.idle_idx		= 1,			\
	.newidle_idx		= 2,			\
	.wake_idx		= 1,			\
	.per_cpu_gain		= 100,			\
	.flags			= SD_LOAD_BALANCE	\
				| SD_BALANCE_EXEC	\
+5 −1
@@ -39,7 +39,11 @@ extern int __node_distance(int, int);
	.busy_factor		= 32,			\
	.imbalance_pct		= 125,			\
	.cache_hot_time		= (10*1000000),		\
	.cache_nice_tries	= 1,			\
	.cache_nice_tries	= 2,			\
	.busy_idx		= 3,			\
	.idle_idx		= 2,			\
	.newidle_idx		= 1, 			\
	.wake_idx		= 1,			\
	.per_cpu_gain		= 100,			\
	.flags			= SD_LOAD_BALANCE	\
				| SD_BALANCE_NEWIDLE	\
+4 −0
@@ -488,6 +488,10 @@ struct sched_domain {
	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
	unsigned int busy_idx;
	unsigned int idle_idx;
	unsigned int newidle_idx;
	unsigned int wake_idx;
	int flags;			/* See SD_* */

	/* Runtime fields. */
+8 −0
@@ -89,6 +89,10 @@
	.cache_hot_time		= 0,			\
	.cache_nice_tries	= 0,			\
	.per_cpu_gain		= 25,			\
	.busy_idx		= 0,			\
	.idle_idx		= 0,			\
	.newidle_idx		= 0,			\
	.wake_idx		= 0,			\
	.flags			= SD_LOAD_BALANCE	\
				| SD_BALANCE_NEWIDLE	\
				| SD_BALANCE_EXEC	\
@@ -115,6 +119,10 @@
	.cache_hot_time		= (5*1000000/2),	\
	.cache_nice_tries	= 1,			\
	.per_cpu_gain		= 100,			\
	.busy_idx		= 2,			\
	.idle_idx		= 0,			\
	.newidle_idx		= 1,			\
	.wake_idx		= 1,			\
	.flags			= SD_LOAD_BALANCE	\
				| SD_BALANCE_NEWIDLE	\
				| SD_BALANCE_EXEC	\
+74 −64
@@ -206,7 +206,7 @@ struct runqueue {
	 */
	unsigned long nr_running;
#ifdef CONFIG_SMP
	unsigned long cpu_load;
	unsigned long cpu_load[3];
#endif
	unsigned long long nr_switches;

@@ -886,23 +886,27 @@ void kick_process(task_t *p)
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static inline unsigned long source_load(int cpu)
static inline unsigned long source_load(int cpu, int type)
{
	runqueue_t *rq = cpu_rq(cpu);
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
	if (type == 0)
		return load_now;

	return min(rq->cpu_load, load_now);
	return min(rq->cpu_load[type-1], load_now);
}

/*
 * Return a high guess at the load of a migration-target cpu
 */
static inline unsigned long target_load(int cpu)
static inline unsigned long target_load(int cpu, int type)
{
	runqueue_t *rq = cpu_rq(cpu);
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
	if (type == 0)
		return load_now;

	return max(rq->cpu_load, load_now);
	return max(rq->cpu_load[type-1], load_now);
}

#endif
@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
	runqueue_t *rq;
#ifdef CONFIG_SMP
	unsigned long load, this_load;
	struct sched_domain *sd;
	struct sched_domain *sd, *this_sd = NULL;
	int new_cpu;
#endif

@@ -986,26 +990,34 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
	if (unlikely(task_running(rq, p)))
		goto out_activate;

#ifdef CONFIG_SCHEDSTATS
	new_cpu = cpu;

	schedstat_inc(rq, ttwu_cnt);
	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
	} else {
		goto out_set_cpu;
	}

	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			schedstat_inc(sd, ttwu_wake_remote);
			this_sd = sd;
			break;
		}
	}
	}
#endif

	new_cpu = cpu;
	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	load = source_load(cpu);
	this_load = target_load(this_cpu);
	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		/*
		 * If sync wakeup then subtract the (maximum possible) effect of
@@ -1020,40 +1032,24 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)

		new_cpu = this_cpu; /* Wake to this CPU if we can */

	/*
	 * Scan domains for affine wakeup and passive balancing
	 * possibilities.
	 */
	for_each_domain(this_cpu, sd) {
		unsigned int imbalance;
		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

		if ((sd->flags & SD_WAKE_AFFINE) &&
				!task_hot(p, rq->timestamp_last_tick, sd)) {
		if ((this_sd->flags & SD_WAKE_AFFINE) &&
			!task_hot(p, rq->timestamp_last_tick, this_sd)) {
			/*
			 * This domain has SD_WAKE_AFFINE and p is cache cold
			 * in this domain.
			 */
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_move_affine);
			schedstat_inc(this_sd, ttwu_move_affine);
			goto out_set_cpu;
			}
		} else if ((sd->flags & SD_WAKE_BALANCE) &&
		} else if ((this_sd->flags & SD_WAKE_BALANCE) &&
				imbalance*this_load <= 100*load) {
			/*
			 * This domain has SD_WAKE_BALANCE and there is
			 * an imbalance.
			 */
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_move_balance);
			schedstat_inc(this_sd, ttwu_move_balance);
			goto out_set_cpu;
		}
	}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
@@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
	cpus_and(mask, sd->span, p->cpus_allowed);

	for_each_cpu_mask(i, mask) {
		load = target_load(i);
		load = target_load(i, sd->wake_idx);

		if (load < min_load) {
			min_cpu = i;
@@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
	}

	/* add +1 to account for the new task */
	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
	this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;

	/*
	 * Would with the addition of the new task to the
@@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
{
	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
	int load_idx;

	max_load = this_load = total_load = total_pwr = 0;
	if (idle == NOT_IDLE)
		load_idx = sd->busy_idx;
	else if (idle == NEWLY_IDLE)
		load_idx = sd->newidle_idx;
	else
		load_idx = sd->idle_idx;

	do {
		unsigned long load;
@@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
		for_each_cpu_mask(i, group->cpumask) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = target_load(i);
				load = target_load(i, load_idx);
			else
				load = source_load(i);
				load = source_load(i, load_idx);

			avg_load += load;
		}
@@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
	int i;

	for_each_cpu_mask(i, group->cpumask) {
		load = source_load(i);
		load = source_load(i, 0);

		if (load > max_load) {
			max_load = load;
@@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
	unsigned long old_load, this_load;
	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
	struct sched_domain *sd;
	int i;

	/* Update our load */
	old_load = this_rq->cpu_load;
	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
	/* Update our load */
	for (i = 0; i < 3; i++) {
		unsigned long new_load = this_load;
		int scale = 1 << i;
		old_load = this_rq->cpu_load[i];
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
	if (this_load > old_load)
		old_load++;
	this_rq->cpu_load = (old_load + this_load) / 2;
		if (new_load > old_load)
			new_load += scale-1;
		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
	}

	for_each_domain(this_cpu, sd) {
		unsigned long interval;
@@ -4921,13 +4929,15 @@ void __init sched_init(void)

		rq = cpu_rq(i);
		spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->active = rq->arrays;
		rq->expired = rq->arrays + 1;
		rq->best_expired_prio = MAX_PRIO;

#ifdef CONFIG_SMP
		rq->sd = &sched_domain_dummy;
		rq->cpu_load = 0;
		for (j = 1; j < 3; j++)
			rq->cpu_load[j] = 0;
		rq->active_balance = 0;
		rq->push_cpu = 0;
		rq->migration_thread = NULL;