Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 63b0e9ed authored by Mike Galbraith's avatar Mike Galbraith Committed by Ingo Molnar
Browse files

sched/fair: Beef up wake_wide()



Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU.  While looking at wake_wide(),
I noticed that it doesn't pay attention to the wakeup of a many partner
waker, returning 1 only when waking one of its many partners.

Correct that, letting explicit domain flags override the heuristic.

While at it, adjust task_struct bits, we don't need a 64-bit counter.

Tested-by: default avatarJosef Bacik <jbacik@fb.com>
Signed-off-by: default avatarMike Galbraith <umgwanakikbuti@gmail.com>
[ Tidy things up. ]
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team<Kernel-team@fb.com>
Cc: morten.rasmussen@arm.com
Cc: riel@redhat.com
Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent fbd705a0
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -1359,9 +1359,9 @@ struct task_struct {
#ifdef CONFIG_SMP
	struct llist_node wake_entry;
	int on_cpu;
	struct task_struct *last_wakee;
	unsigned long wakee_flips;
	unsigned int wakee_flips;
	unsigned long wakee_flip_decay_ts;
	struct task_struct *last_wakee;

	int wake_cpu;
#endif
+33 −34
Original line number Diff line number Diff line
@@ -4726,26 +4726,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)

#endif

/*
 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
 * A waker of many should wake a different task than the one last awakened
 * at a frequency roughly N times higher than one of its wakees.  In order
 * to determine whether we should let the load spread vs consolodating to
 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
 * partner, and a factor of lls_size higher frequency in the other.  With
 * both conditions met, we can be relatively sure that the relationship is
 * non-monogamous, with partner count exceeding socket size.  Waker/wakee
 * being client/server, worker/dispatcher, interrupt source or whatever is
 * irrelevant, spread criteria is apparent partner count exceeds socket size.
 */
static int wake_wide(struct task_struct *p)
{
	unsigned int master = current->wakee_flips;
	unsigned int slave = p->wakee_flips;
	int factor = this_cpu_read(sd_llc_size);

	/*
	 * Yeah, it's the switching-frequency, could means many wakee or
	 * rapidly switch, use factor here will just help to automatically
	 * adjust the loose-degree, so bigger node will lead to more pull.
	 */
	if (p->wakee_flips > factor) {
		/*
		 * wakee is somewhat hot, it needs certain amount of cpu
		 * resource, so if waker is far more hot, prefer to leave
		 * it alone.
		 */
		if (current->wakee_flips > (factor * p->wakee_flips))
			return 1;
	}

	if (master < slave)
		swap(master, slave);
	if (slave < factor || master < slave * factor)
		return 0;
	return 1;
}

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4757,13 +4760,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
	unsigned long weight;
	int balanced;

	/*
	 * If we wake multiple tasks be careful to not bounce
	 * ourselves around too much.
	 */
	if (wake_wide(p))
		return 0;

	idx	  = sd->wake_idx;
	this_cpu  = smp_processor_id();
	prev_cpu  = task_cpu(p);
@@ -5017,17 +5013,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
{
	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
	int cpu = smp_processor_id();
	int new_cpu = cpu;
	int new_cpu = prev_cpu;
	int want_affine = 0;
	int sync = wake_flags & WF_SYNC;

	if (sd_flag & SD_BALANCE_WAKE)
		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

	rcu_read_lock();
	for_each_domain(cpu, tmp) {
		if (!(tmp->flags & SD_LOAD_BALANCE))
			continue;
			break;

		/*
		 * If both cpu and prev_cpu are part of this domain,
@@ -5041,17 +5037,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f

		if (tmp->flags & sd_flag)
			sd = tmp;
		else if (!want_affine)
			break;
	}

	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
		prev_cpu = cpu;

	if (sd_flag & SD_BALANCE_WAKE) {
		new_cpu = select_idle_sibling(p, prev_cpu);
		goto unlock;
	if (affine_sd) {
		sd = NULL; /* Prefer wake_affine over balance flags */
		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
			new_cpu = cpu;
	}

	while (sd) {
	if (!sd) {
		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
			new_cpu = select_idle_sibling(p, new_cpu);

	} else while (sd) {
		struct sched_group *group;
		int weight;

@@ -5085,7 +5085,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
		}
		/* while loop will break here if sd == NULL */
	}
unlock:
	rcu_read_unlock();

	return new_cpu;