Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1de64443, authored by Peter Zijlstra, committed by Ingo Molnar
Browse files

sched/core: Fix task and run queue sched_info::run_delay inconsistencies



Mike Meyer reported the following bug:

> During evaluation of some performance data, it was discovered thread
> and run queue run_delay accounting data was inconsistent with the other
> accounting data that was collected.  Further investigation found under
> certain circumstances execution time was leaking into the task and
> run queue accounting of run_delay.
>
> Consider the following sequence:
>
>     a. thread is running.
>     b. thread moves between cgroups, changes scheduling class or priority.
>     c. thread sleeps OR
>     d. thread involuntarily gives up cpu.
>
> a. implies:
>
>     thread->sched_info.last_queued = 0
>
> a. and b. results in the following:
>
>     1. dequeue_task(rq, thread)
>
>            sched_info_dequeued(rq, thread)
>                delta = 0
>
>                sched_info_reset_dequeued(thread)
>                    thread->sched_info.last_queued = 0
>
>                thread->sched_info.run_delay += delta
>
>     2. enqueue_task(rq, thread)
>
>            sched_info_queued(rq, thread)
>
>                /* thread is still on cpu at this point. */
>                thread->sched_info.last_queued = task_rq(thread)->clock;
>
> c. results in:
>
>     dequeue_task(rq, thread)
>
>         sched_info_dequeued(rq, thread)
>
>             /* delta is execution time not run_delay. */
>             delta = task_rq(thread)->clock - thread->sched_info.last_queued
>
>         sched_info_reset_dequeued(thread)
>             thread->sched_info.last_queued = 0
>
>         thread->sched_info.run_delay += delta
>
>     Since thread was running between enqueue_task(rq, thread) and
>     dequeue_task(rq, thread), the delta above is really execution
>     time and not run_delay.
>
> d. results in:
>
>     __sched_info_switch(thread, next_thread)
>
>         sched_info_depart(rq, thread)
>
>             sched_info_queued(rq, thread)
>
>                 /* last_queued not updated due to being non-zero */
>                 return
>
>     Since thread was running between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread), the execution time
>     between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread) now will become
>     associated with run_delay due to when last_queued was last updated.
>

This alternative patch solves the problem by not calling
sched_info_{de,}queued() in {de,en}queue_task(). Therefore the
sched_info state is preserved and things work as expected.

By inlining the {de,en}queue_task() functions the new condition
becomes (mostly) a compile-time constant and we'll not emit any new
branch instructions.

It even shrinks the code (due to inlining {en,de}queue_task()):

$ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig
   text    data     bss     dec     hex filename
  64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
  64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig

Reported-by: Mike Meyer <Mike.Meyer@Teradata.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net


Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent b52da86e
Loading
Loading
Loading
Loading
+25 −19
Original line number Diff line number Diff line
@@ -827,16 +827,18 @@ static void set_load_weight(struct task_struct *p)
	load->inv_weight = prio_to_wmult[prio];
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, 0);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	}
	if (running)
		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, p, 0);
		enqueue_task(rq, p, ENQUEUE_RESTORE);
}

/*
@@ -1692,7 +1694,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#endif /* CONFIG_SCHEDSTATS */
}

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -3325,7 +3327,7 @@ EXPORT_SYMBOL(default_wake_function);
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	int oldprio, queued, running, enqueue_flag = 0;
	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
	struct rq *rq;
	const struct sched_class *prev_class;

@@ -3357,7 +3359,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, 0);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	if (running)
		put_prev_task(rq, p);

@@ -3375,7 +3377,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
		if (!dl_prio(p->normal_prio) ||
		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
			p->dl.dl_boosted = 1;
			enqueue_flag = ENQUEUE_REPLENISH;
			enqueue_flag |= ENQUEUE_REPLENISH;
		} else
			p->dl.dl_boosted = 0;
		p->sched_class = &dl_sched_class;
@@ -3383,7 +3385,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		if (oldprio < prio)
			enqueue_flag = ENQUEUE_HEAD;
			enqueue_flag |= ENQUEUE_HEAD;
		p->sched_class = &rt_sched_class;
	} else {
		if (dl_prio(oldprio))
@@ -3435,7 +3437,7 @@ void set_user_nice(struct task_struct *p, long nice)
	}
	queued = task_on_rq_queued(p);
	if (queued)
		dequeue_task(rq, p, 0);
		dequeue_task(rq, p, DEQUEUE_SAVE);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
@@ -3444,7 +3446,7 @@ void set_user_nice(struct task_struct *p, long nice)
	delta = p->prio - old_prio;

	if (queued) {
		enqueue_task(rq, p, 0);
		enqueue_task(rq, p, ENQUEUE_RESTORE);
		/*
		 * If the task increased its priority or is running and
		 * lowered its priority, then reschedule its CPU:
@@ -3946,7 +3948,7 @@ static int __sched_setscheduler(struct task_struct *p,
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, 0);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	if (running)
		put_prev_task(rq, p);

@@ -3956,11 +3958,15 @@ static int __sched_setscheduler(struct task_struct *p,
	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued) {
		int enqueue_flags = ENQUEUE_RESTORE;
		/*
		 * We enqueue to tail when the priority of a task is
		 * increased (user space view).
		 */
		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
		if (oldprio <= p->prio)
			enqueue_flags |= ENQUEUE_HEAD;

		enqueue_task(rq, p, enqueue_flags);
	}

	check_class_changed(rq, p, prev_class, oldprio);
@@ -5109,7 +5115,7 @@ void sched_setnuma(struct task_struct *p, int nid)
	running = task_current(rq, p);

	if (queued)
		dequeue_task(rq, p, 0);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	if (running)
		put_prev_task(rq, p);

@@ -5118,7 +5124,7 @@ void sched_setnuma(struct task_struct *p, int nid)
	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, p, 0);
		enqueue_task(rq, p, ENQUEUE_RESTORE);
	task_rq_unlock(rq, p, &flags);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -7737,7 +7743,7 @@ void sched_move_task(struct task_struct *tsk)
	queued = task_on_rq_queued(tsk);

	if (queued)
		dequeue_task(rq, tsk, 0);
		dequeue_task(rq, tsk, DEQUEUE_SAVE);
	if (unlikely(running))
		put_prev_task(rq, tsk);

@@ -7761,7 +7767,7 @@ void sched_move_task(struct task_struct *tsk)
	if (unlikely(running))
		tsk->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, tsk, 0);
		enqueue_task(rq, tsk, ENQUEUE_RESTORE);

	task_rq_unlock(rq, tsk, &flags);
}
+8 −6
Original line number Diff line number Diff line
@@ -1151,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

#define ENQUEUE_WAKEUP		1
#define ENQUEUE_HEAD		2
#define ENQUEUE_WAKEUP		0x01
#define ENQUEUE_HEAD		0x02
#ifdef CONFIG_SMP
#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
#else
#define ENQUEUE_WAKING		0
#define ENQUEUE_WAKING		0x00
#endif
#define ENQUEUE_REPLENISH	8
#define ENQUEUE_REPLENISH	0x08
#define ENQUEUE_RESTORE	0x10

#define DEQUEUE_SLEEP		1
#define DEQUEUE_SLEEP		0x01
#define DEQUEUE_SAVE		0x02

#define RETRY_TASK		((void *)-1UL)