
Commit b29739f9 authored by Ingo Molnar, committed by Linus Torvalds

[PATCH] pi-futex: scheduler support for pi



Add framework to boost/unboost the priority of RT tasks.

This consists of:

 - caching the 'normal' priority in ->normal_prio
 - providing functions to set/get the priority of the task
 - making sched_setscheduler() aware of boosting

The effective_prio() cleanups also fix a priority-calculation bug in
set_user_nice(), pointed out by Andrey Gelman.

has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au>
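
The model behind the patch is small enough to mock up outside the kernel. Below is an illustrative plain-C sketch, not kernel code: the field names mirror the patch (static_prio, normal_prio, prio), the numeric convention follows the kernel (lower value = higher priority, RT range [0, MAX_RT_PRIO)), and every function body is a simplified stand-in for the real __normal_prio()/effective_prio()/rt_mutex_setprio() logic:

#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)

struct task {
	int static_prio;	/* nice-derived, set via set_user_nice() */
	int normal_prio;	/* policy/interactivity result, no PI */
	int prio;		/* effective value used by the scheduler */
};

/* Stand-in for effective_prio(): refresh the cached normal priority,
 * but leave a task that currently runs at an RT (boosted) prio alone. */
static int effective_prio(struct task *p)
{
	p->normal_prio = p->static_prio;	/* __normal_prio() stand-in */
	if (p->prio >= MAX_RT_PRIO)		/* not RT, not boosted */
		return p->normal_prio;
	return p->prio;
}

/* Stand-ins for rt_mutex_setprio(): touch only the effective prio,
 * never ->normal_prio, so unboosting can restore the old value. */
static void boost(struct task *p, int rt_prio)	{ p->prio = rt_prio; }
static void unboost(struct task *p)		{ p->prio = p->normal_prio; }

int main(void)
{
	struct task t = { MAX_PRIO-20, MAX_PRIO-20, MAX_PRIO-20 };

	t.prio = effective_prio(&t);
	printf("normal=%d effective=%d\n", t.normal_prio, t.prio);
	boost(&t, 50);		/* inherit an RT waiter's priority */
	printf("boosted effective=%d, normal still %d\n", t.prio, t.normal_prio);
	unboost(&t);		/* owner released the rt-mutex */
	printf("restored effective=%d\n", t.prio);
	return 0;
}

This separation is the reason the patch caches ->normal_prio at all: at unboost time the rt-mutex code needs a stable pre-boost value to restore, independent of when the interactivity estimator last ran.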

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrey Gelman <agelman@012.net.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 77ba89c5
include/linux/init_task.h: +2 −0
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
@@ -122,6 +123,7 @@ extern struct group_info init_groups;
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
+	.pi_lock	= SPIN_LOCK_UNLOCKED,				\
 }


include/linux/sched.h: +19 −2
@@ -495,8 +495,11 @@ struct signal_struct {
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -725,7 +728,7 @@ struct task_struct {
 #endif
 #endif
 	int load_weight;	/* for niceness load balancing purposes */
-	int prio, static_prio;
+	int prio, static_prio, normal_prio;
 	struct list_head run_list;
 	prio_array_t *array;
 
@@ -852,6 +855,9 @@ struct task_struct {
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
 	spinlock_t alloc_lock;
 
+	/* Protection of the PI data structures: */
+	spinlock_t pi_lock;
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
@@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) {}
 #endif
 
 extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+	return p->normal_prio;
+}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
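
A note on the #else stub in the hunk above: it is what lets the scheduler changes below compile unchanged when RT-mutexes are configured out. In that case rt_mutex_getprio() simply reports the cached ->normal_prio, so the later "p->prio = rt_mutex_getprio(p)" in __setscheduler() degenerates to a plain copy and effective and normal priority can never diverge. A standalone illustration of that fallback (a hypothetical userspace mock, not the kernel header):

#include <assert.h>

struct task { int prio, normal_prio; };

#ifdef CONFIG_RT_MUTEXES
/* With PI support the rt_mutex code may report a boosted value. */
extern int rt_mutex_getprio(struct task *p);
#else
/* Without PI support the effective priority is always the normal one. */
static inline int rt_mutex_getprio(struct task *p)
{
	return p->normal_prio;
}
#endif

int main(void)
{
	struct task t = { .prio = 140, .normal_prio = 120 };

	/* what __setscheduler() does after computing normal_prio(p): */
	t.prio = rt_mutex_getprio(&t);
	assert(t.prio == t.normal_prio);	/* holds in !CONFIG_RT_MUTEXES builds */
	return 0;
}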
kernel/sched.c: +160 −29
@@ -354,6 +354,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock(&rq->lock);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
@@ -375,6 +394,12 @@ static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 	return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 	__releases(rq->lock)
 {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 
 static void set_load_weight(task_t *p)
 {
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
@@ -730,6 +753,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 	dec_raw_weighted_load(rq, p);
 }
 
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (has_rt_policy(p))
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child:
+	 */
+	p->prio = current->normal_prio;
+
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 				__activate_task(p, rq);
 			else {
 				p->prio = current->prio;
+				p->normal_prio = current->normal_prio;
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int old_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
 		dec_raw_weighted_load(rq, p);
 	}
 
-	old_prio = p->prio;
-	new_prio = NICE_TO_PRIO(nice);
-	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
-	p->prio += delta;
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
 	task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
 	set_load_weight(p);
 }

@@ -3911,15 +4032,21 @@ int sched_setscheduler(struct task_struct *p, int policy,
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
 		return retval;
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock_irqsave(&p->pi_lock, flags);
 	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
 	array = p->array;
@@ -3940,7 +4067,9 @@ int sched_setscheduler(struct task_struct *p, int policy,
 		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
 		if (!rt_task(p))
 			continue;
 
-		rq = task_rq_lock(p, &flags);
+		spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_rq_lock(p);
 
 		array = p->array;
 		if (array)
@@ -6593,7 +6723,8 @@
 			resched_task(rq->curr);
 		}
 
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 	}
 	read_unlock_irq(&tasklist_lock);
 }
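
For context beyond this commit: rt_mutex_setprio() is the hook the later pi-futex patches in this series invoke when a task blocks on, or releases, a priority-inheriting lock. From userspace the feature is eventually reached through PI futexes, most commonly via POSIX priority-inheriting mutexes. A minimal usage sketch, assuming a kernel with pi-futex support and a glibc that implements PTHREAD_PRIO_INHERIT:

/* build: gcc -o pi_demo pi_demo.c -lpthread */
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t lock;

	pthread_mutexattr_init(&attr);
	/* Request priority inheritance: while a higher-priority (RT)
	 * thread is blocked on 'lock', the kernel boosts the owner's
	 * effective priority -- the rt_mutex_setprio() path above. */
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
		fprintf(stderr, "PTHREAD_PRIO_INHERIT not supported\n");
		return 1;
	}
	pthread_mutex_init(&lock, &attr);

	pthread_mutex_lock(&lock);
	/* critical section: an RT waiter here would lend us its priority */
	pthread_mutex_unlock(&lock);

	pthread_mutex_destroy(&lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}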