Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6e2df058 authored by Peter Zijlstra's avatar Peter Zijlstra
Browse files

sched: Fix pick_next_task() vs 'change' pattern race



Commit 67692435 ("sched: Rework pick_next_task() slow-path")
inadvertly introduced a race because it changed a previously
unexplored dependency between dropping the rq->lock and
sched_class::put_prev_task().

The comments about dropping rq->lock, in for example
newidle_balance(), only mentions the task being current and ->on_cpu
being set. But when we look at the 'change' pattern (in for example
sched_setnuma()):

	queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */
	running = task_current(rq, p); /* rq->curr == p */

	if (queued)
		dequeue_task(...);
	if (running)
		put_prev_task(...);

	/* change task properties */

	if (queued)
		enqueue_task(...);
	if (running)
		set_next_task(...);

It becomes obvious that if we do this after put_prev_task() has
already been called on @p, things go sideways. This is exactly what
the commit in question allows to happen when it does:

	prev->sched_class->put_prev_task(rq, prev, rf);
	if (!rq->nr_running)
		newidle_balance(rq, rf);

The newidle_balance() call will drop rq->lock after we've called
put_prev_task() and that allows the above 'change' pattern to
interleave and mess up the state.

Furthermore, it turns out we lost the RT-pull when we put the last DL
task.

Fix both problems by extracting the balancing from put_prev_task() and
doing a multi-class balance() pass before put_prev_task().

Fixes: 67692435 ("sched: Rework pick_next_task() slow-path")
Reported-by: default avatarQuentin Perret <qperret@google.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: default avatarQuentin Perret <qperret@google.com>
Tested-by: default avatarValentin Schneider <valentin.schneider@arm.com>
parent e3b8b6a0
Loading
Loading
Loading
Loading
+15 −6
Original line number Diff line number Diff line
@@ -3929,13 +3929,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	}

restart:
#ifdef CONFIG_SMP
	/*
	 * Ensure that we put DL/RT tasks before the pick loop, such that they
	 * can PULL higher prio tasks when we lower the RQ 'priority'.
	 * We must do the balancing pass before put_next_task(), such
	 * that when we release the rq->lock the task is in the same
	 * state as before we took rq->lock.
	 *
	 * We can terminate the balance pass as soon as we know there is
	 * a runnable task of @class priority or higher.
	 */
	prev->sched_class->put_prev_task(rq, prev, rf);
	if (!rq->nr_running)
		newidle_balance(rq, rf);
	for_class_range(class, prev->sched_class, &idle_sched_class) {
		if (class->balance(rq, prev, rf))
			break;
	}
#endif

	put_prev_task(rq, prev);

	for_each_class(class) {
		p = class->pick_next_task(rq, NULL, NULL);
@@ -6201,7 +6210,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
	for_each_class(class) {
		next = class->pick_next_task(rq, NULL, NULL);
		if (next) {
			next->sched_class->put_prev_task(rq, next, NULL);
			next->sched_class->put_prev_task(rq, next);
			return next;
		}
	}
+20 −20
Original line number Diff line number Diff line
@@ -1691,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
	resched_curr(rq);
}

static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_dl_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
@@ -1758,45 +1774,28 @@ static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	struct sched_dl_entity *dl_se;
	struct dl_rq *dl_rq = &rq->dl;
	struct task_struct *p;
	struct dl_rq *dl_rq;

	WARN_ON_ONCE(prev || rf);

	dl_rq = &rq->dl;

	if (unlikely(!dl_rq->dl_nr_running))
	if (!sched_dl_runnable(rq))
		return NULL;

	dl_se = pick_next_dl_entity(rq, dl_rq);
	BUG_ON(!dl_se);

	p = dl_task_of(dl_se);

	set_next_task_dl(rq, p);

	return p;
}

static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
	update_curr_dl(rq);

	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
		enqueue_pushable_dl_task(rq, p);

	if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_dl_task(rq);
		rq_repin_lock(rq, rf);
	}
}

/*
@@ -2442,6 +2441,7 @@ const struct sched_class dl_sched_class = {
	.set_next_task		= set_next_task_dl,

#ifdef CONFIG_SMP
	.balance		= balance_dl,
	.select_task_rq		= select_task_rq_dl,
	.migrate_task_rq	= migrate_task_rq_dl,
	.set_cpus_allowed       = set_cpus_allowed_dl,
+12 −3
Original line number Diff line number Diff line
@@ -6570,6 +6570,15 @@ static void task_dead_fair(struct task_struct *p)
{
	remove_entity_load_avg(&p->se);
}

static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	if (rq->nr_running)
		return 1;

	return newidle_balance(rq, rf) != 0;
}
#endif /* CONFIG_SMP */

static unsigned long wakeup_gran(struct sched_entity *se)
@@ -6746,7 +6755,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
	int new_tasks;

again:
	if (!cfs_rq->nr_running)
	if (!sched_fair_runnable(rq))
		goto idle;

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6884,7 +6893,7 @@ done: __maybe_unused;
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;
@@ -10414,11 +10423,11 @@ const struct sched_class fair_sched_class = {
	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,

	.put_prev_task		= put_prev_task_fair,
	.set_next_task          = set_next_task_fair,

#ifdef CONFIG_SMP
	.balance		= balance_fair,
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

+8 −1
Original line number Diff line number Diff line
@@ -365,6 +365,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks as never migrated */
}

static int
balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	return WARN_ON_ONCE(1);
}
#endif

/*
@@ -375,7 +381,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
	resched_curr(rq);
}

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}

@@ -460,6 +466,7 @@ const struct sched_class idle_sched_class = {
	.set_next_task          = set_next_task_idle,

#ifdef CONFIG_SMP
	.balance		= balance_idle,
	.select_task_rq		= select_task_rq_idle,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif
+19 −18
Original line number Diff line number Diff line
@@ -1469,6 +1469,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
	resched_curr(rq);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
@@ -1552,21 +1568,18 @@ static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	struct task_struct *p;
	struct rt_rq *rt_rq = &rq->rt;

	WARN_ON_ONCE(prev || rf);

	if (!rt_rq->rt_queued)
	if (!sched_rt_runnable(rq))
		return NULL;

	p = _pick_next_task_rt(rq);

	set_next_task_rt(rq, p);

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

@@ -1578,18 +1591,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_fla
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);

	if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}
}

#ifdef CONFIG_SMP
@@ -2366,8 +2367,8 @@ const struct sched_class rt_sched_class = {
	.set_next_task          = set_next_task_rt,

#ifdef CONFIG_SMP
	.balance		= balance_rt,
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed       = set_cpus_allowed_common,
	.rq_online              = rq_online_rt,
	.rq_offline             = rq_offline_rt,
Loading