Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e7693a36 authored by Gregory Haskins's avatar Gregory Haskins Committed by Ingo Molnar
Browse files

sched: de-SCHED_OTHER-ize the RT path



The current wake-up code path tries to determine if it can optimize the
wake-up to "this_cpu" by computing load calculations.  The problem is that
these calculations are only relevant to SCHED_OTHER tasks where load is king.
For RT tasks, priority is king.  So the load calculation is completely wasted
bandwidth.

Therefore, we create a new sched_class interface to help with
pre-wakeup routing decisions and move the load calculation as a function
of CFS task's class.

Signed-off-by: default avatarGregory Haskins <ghaskins@novell.com>
Signed-off-by: default avatarSteven Rostedt <srostedt@redhat.com>
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
parent 697f0a48
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -827,6 +827,7 @@ struct sched_class {
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
	void (*yield_task) (struct rq *rq);
	int  (*select_task_rq)(struct task_struct *p, int sync);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);

+27 −140
Original line number Diff line number Diff line
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
	update_load_sub(&rq->load, load);
}

#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#endif /* CONFIG_SMP */

#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
/*
 * Is this task likely cache-hot:
 */
static inline int
static int
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
	s64 delta;
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type)
/*
 * Return the average load per task on the cpu's run queue
 */
static inline unsigned long cpu_avg_load_per_task(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag)

#endif /* CONFIG_SMP */

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must be also busy(in most cases) as they didn't already
	 * pickup the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalities associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_IDLE) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			for_each_cpu_mask(i, tmp) {
				if (idle_cpu(i)) {
					if (i != task_cpu(p)) {
						schedstat_inc(p,
							se.nr_wakeups_idle);
					}
					return i;
				}
			}
		} else {
			break;
		}
	}
	return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
	long old_state;
	struct rq *rq;
#ifdef CONFIG_SMP
	struct sched_domain *sd, *this_sd = NULL;
	unsigned long load, this_load;
	int new_cpu;
#endif

@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	new_cpu = cpu;

	schedstat_inc(rq, ttwu_count);
	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		goto out_set_cpu;
	}

	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			schedstat_inc(sd, ttwu_wake_remote);
			this_sd = sd;
			break;
		}
	}

	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;

		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		new_cpu = this_cpu; /* Wake to this CPU if we can */

		if (this_sd->flags & SD_WAKE_AFFINE) {
			unsigned long tl = this_load;
			unsigned long tl_per_task;

			/*
			 * Attract cache-cold tasks on sync wakeups:
			 */
			if (sync && !task_hot(p, rq->clock, this_sd))
				goto out_set_cpu;

			schedstat_inc(p, se.nr_wakeups_affine_attempts);
			tl_per_task = cpu_avg_load_per_task(this_cpu);

			/*
			 * If sync wakeup then subtract the (maximum possible)
			 * effect of the currently running task from the load
			 * of the current CPU:
			 */
			if (sync)
				tl -= current->se.load.weight;

			if ((tl <= load &&
				tl + target_load(cpu, idx) <= tl_per_task) ||
			       100*(tl + p->se.load.weight) <= imbalance*load) {
				/*
				 * This domain has SD_WAKE_AFFINE and
				 * p is cache cold in this domain, and
				 * there is no bad imbalance.
				 */
				schedstat_inc(this_sd, ttwu_move_affine);
				schedstat_inc(p, se.nr_wakeups_affine);
				goto out_set_cpu;
			}
		}

		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		if (this_sd->flags & SD_WAKE_BALANCE) {
			if (imbalance*this_load <= 100*load) {
				schedstat_inc(this_sd, ttwu_move_balance);
				schedstat_inc(p, se.nr_wakeups_passive);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	new_cpu = wake_idle(new_cpu, p);
	new_cpu = p->sched_class->select_task_rq(p, sync);
	if (new_cpu != cpu) {
		set_task_cpu(p, new_cpu);
		task_rq_unlock(rq, &flags);
@@ -1693,6 +1563,23 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
		cpu = task_cpu(p);
	}

#ifdef CONFIG_SCHEDSTATS
	schedstat_inc(rq, ttwu_count);
	if (cpu == this_cpu)
		schedstat_inc(rq, ttwu_local);
	else {
		struct sched_domain *sd;
		for_each_domain(this_cpu, sd) {
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
	}

#endif


out_activate:
#endif /* CONFIG_SMP */
	schedstat_inc(p, se.nr_wakeups);
+148 −0
Original line number Diff line number Diff line
@@ -860,6 +860,151 @@ static void yield_task_fair(struct rq *rq)
	se->vruntime = rightmost->vruntime + 1;
}

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must be also busy(in most cases) as they didn't already
	 * pickup the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalities associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_IDLE) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			for_each_cpu_mask(i, tmp) {
				if (idle_cpu(i)) {
					if (i != task_cpu(p)) {
						schedstat_inc(p,
						       se.nr_wakeups_idle);
					}
					return i;
				}
			}
		} else {
			break;
		}
	}
	return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

#ifdef CONFIG_SMP
static int select_task_rq_fair(struct task_struct *p, int sync)
{
	int cpu, this_cpu;
	struct rq *rq;
	struct sched_domain *sd, *this_sd = NULL;
	int new_cpu;

	cpu      = task_cpu(p);
	rq       = task_rq(p);
	this_cpu = smp_processor_id();
	new_cpu  = cpu;

	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			this_sd = sd;
			break;
		}
	}

	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;
		unsigned long load, this_load;

		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		new_cpu = this_cpu; /* Wake to this CPU if we can */

		if (this_sd->flags & SD_WAKE_AFFINE) {
			unsigned long tl = this_load;
			unsigned long tl_per_task;

			/*
			 * Attract cache-cold tasks on sync wakeups:
			 */
			if (sync && !task_hot(p, rq->clock, this_sd))
				goto out_set_cpu;

			schedstat_inc(p, se.nr_wakeups_affine_attempts);
			tl_per_task = cpu_avg_load_per_task(this_cpu);

			/*
			 * If sync wakeup then subtract the (maximum possible)
			 * effect of the currently running task from the load
			 * of the current CPU:
			 */
			if (sync)
				tl -= current->se.load.weight;

			if ((tl <= load &&
				tl + target_load(cpu, idx) <= tl_per_task) ||
			       100*(tl + p->se.load.weight) <= imbalance*load) {
				/*
				 * This domain has SD_WAKE_AFFINE and
				 * p is cache cold in this domain, and
				 * there is no bad imbalance.
				 */
				schedstat_inc(this_sd, ttwu_move_affine);
				schedstat_inc(p, se.nr_wakeups_affine);
				goto out_set_cpu;
			}
		}

		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		if (this_sd->flags & SD_WAKE_BALANCE) {
			if (imbalance*this_load <= 100*load) {
				schedstat_inc(this_sd, ttwu_move_balance);
				schedstat_inc(p, se.nr_wakeups_passive);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */


/*
 * Preempt the current task with a newly woken task if needed:
 */
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = {
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
#endif /* CONFIG_SMP */

	.check_preempt_curr	= check_preempt_wakeup,

+9 −0
Original line number Diff line number Diff line
@@ -5,6 +5,12 @@
 *  handled in sched_fair.c)
 */

#ifdef CONFIG_SMP
static int select_task_rq_idle(struct task_struct *p, int sync)
{
	return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif /* CONFIG_SMP */
/*
 * Idle tasks are unconditionally rescheduled:
 */
@@ -72,6 +78,9 @@ const struct sched_class idle_sched_class = {

	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,
#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_idle,
#endif /* CONFIG_SMP */

	.check_preempt_curr	= check_preempt_curr_idle,

+10 −0
Original line number Diff line number Diff line
@@ -150,6 +150,13 @@ yield_task_rt(struct rq *rq)
	requeue_task_rt(rq, rq->curr);
}

#ifdef CONFIG_SMP
static int select_task_rq_rt(struct task_struct *p, int sync)
{
	return task_cpu(p);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
@@ -667,6 +674,9 @@ const struct sched_class rt_sched_class = {
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,
#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,
#endif /* CONFIG_SMP */

	.check_preempt_curr	= check_preempt_curr_rt,