Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9af6528e authored by Peter Zijlstra, committed by Ingo Molnar
Browse files

sched/core: Optimize __schedule()



Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD
context switch, we can avoid the TASK_DEAD special case currently in
__schedule() because that avoids the extra preempt_disable() from
schedule().

In order to facilitate this, create a do_task_dead() helper which we
place in the scheduler code, such that it can access __schedule().

Also add some __noreturn annotations to the functions; there's no
coming back from do_exit().

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Cheng Chao <cs.os.kernel@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: akpm@linux-foundation.org
Cc: chris@chris-wilson.co.uk
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/20160913163729.GB5012@twins.programming.kicks-ass.net


Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent bf89a304
Loading
Loading
Loading
Loading
+3 −6
Original line number Diff line number Diff line
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(int state);
__printf(1, 2)
void panic(const char *fmt, ...)
	__noreturn __cold;
void panic(const char *fmt, ...) __noreturn __cold;
void nmi_panic(struct pt_regs *regs, const char *msg);
extern void oops_enter(void);
extern void oops_exit(void);
void print_oops_end_marker(void);
extern int oops_may_print(void);
void do_exit(long error_code)
	__noreturn;
void complete_and_exit(struct completion *, long)
	__noreturn;
void do_exit(long error_code) __noreturn;
void complete_and_exit(struct completion *, long) __noreturn;

/* Internal, do not use. */
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
+2 −0
Original line number Diff line number Diff line
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}

/* Marks the current task TASK_DEAD and schedules away from it; never returns. */
void __noreturn do_task_dead(void);

struct nsproxy;
struct user_namespace;

+2 −24
Original line number Diff line number Diff line
@@ -725,7 +725,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif

void do_exit(long code)
void __noreturn do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;
@@ -882,29 +882,7 @@ void do_exit(long code)
	exit_rcu();
	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));

	/*
	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
	 * when the following two conditions become true.
	 *   - There is race condition of mmap_sem (It is acquired by
	 *     exit_mm()), and
	 *   - SMI occurs before setting TASK_RUNINNG.
	 *     (or hypervisor of virtual machine switches to other guest)
	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
	 *
	 * To avoid it, we have to wait for releasing tsk->pi_lock which
	 * is held by try_to_wake_up()
	 */
	smp_mb();
	raw_spin_unlock_wait(&tsk->pi_lock);

	/* causes final put_task_struct in finish_task_switch(). */
	tsk->state = TASK_DEAD;
	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
	schedule();
	BUG();
	/* Avoid "noreturn function does return".  */
	for (;;)
		cpu_relax();	/* For when BUG is null */
	do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);

+27 −11
Original line number Diff line number Diff line
@@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
	rq = cpu_rq(cpu);
	prev = rq->curr;

	/*
	 * do_exit() calls schedule() with preemption disabled as an exception;
	 * however we must fix that up, otherwise the next task will see an
	 * inconsistent (higher) preempt count.
	 *
	 * It also avoids the below schedule_debug() test from complaining
	 * about this.
	 */
	if (unlikely(prev->state == TASK_DEAD))
		preempt_enable_no_resched_notrace();

	schedule_debug(prev);

	if (sched_feat(HRTICK))
@@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
}
STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */

/*
 * do_task_dead - final step of task exit: mark the current task TASK_DEAD
 * and switch away from it for the last time.
 *
 * Calls __schedule() directly rather than schedule(). The function is
 * annotated __noreturn; control is not expected to come back here.
 */
void __noreturn do_task_dead(void)
{
	/*
	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
	 * when the following two conditions become true:
	 *   - there is a race condition on mmap_sem (it is acquired by
	 *     exit_mm()), and
	 *   - an SMI occurs before setting TASK_RUNNING
	 *     (or the hypervisor of a virtual machine switches to another guest).
	 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
	 *
	 * To avoid it, we have to wait for the release of current->pi_lock,
	 * which is held by try_to_wake_up().
	 */
	smp_mb();
	raw_spin_unlock_wait(&current->pi_lock);

	/* causes final put_task_struct in finish_task_switch(). */
	__set_current_state(TASK_DEAD);
	current->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
	__schedule(false);	/* preempt == false */
	BUG();	/* reached only if __schedule() returned */
	/* Avoid "noreturn function does return".  */
	for (;;)
		cpu_relax();	/* For when BUG is null */
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))