Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 37407ea7 authored by Linus Torvalds
Browse files

Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturbations"



This reverts commit 970e1789.

Nikolay Ulyanitsky reported that the 3.6-rc5 kernel has a 15-20%
performance drop on PostgreSQL 9.2 on his machine (running "pgbench").

Borislav Petkov was able to reproduce this, and bisected it to this
commit 970e1789 ("sched: Improve scalability via 'CPU buddies' ...")
apparently because the new single-idle-buddy model simply doesn't find
idle CPUs to reschedule on aggressively enough.

Mike Galbraith suspects that it is likely due to the user-mode spinlocks
in PostgreSQL not reacting well to preemption, but we don't really know
the details - I'll just revert the commit for now.

There are hopefully other approaches to improve scheduler scalability
without it causing these kinds of downsides.

Reported-by: Nikolay Ulyanitsky <lystor@gmail.com>
Bisected-by: Borislav Petkov <bp@alien8.de>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3f0c3c8f
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -954,7 +954,6 @@ struct sched_domain {
	unsigned int smt_gain;
	int flags;			/* See SD_* */
	int level;
	int idle_buddy;			/* cpu assigned to select_idle_sibling() */

	/* Runtime fields. */
	unsigned long last_balance;	/* init to jiffies. units in jiffies */
+1 −38
Original line number Diff line number Diff line
@@ -6014,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
 * allows us to avoid some pointer chasing select_idle_sibling().
 *
 * Iterate domains and sched_groups downward, assigning CPUs to be
 * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
 * due to random perturbation self canceling, ie sw buddies pull
 * their counterpart to their CPU's hw counterpart.
 *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6032,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
	int id = cpu;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		struct sched_domain *tmp = sd;
		struct sched_group *sg, *prev;
		bool right;

		/*
		 * Traverse to first CPU in group, and count hops
		 * to cpu from there, switching direction on each
		 * hop, never ever pointing the last CPU rightward.
		 */
		do {
			id = cpumask_first(sched_domain_span(tmp));
			prev = sg = tmp->groups;
			right = 1;

			while (cpumask_first(sched_group_cpus(sg)) != id)
				sg = sg->next;

			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
				prev = sg;
				sg = sg->next;
				right = !right;
			}

			/* A CPU went down, never point back to domain start. */
			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
				right = false;

			sg = right ? sg->next : prev;
			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
		} while ((tmp = tmp->child));

	if (sd)
		id = cpumask_first(sched_domain_span(sd));
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_id, cpu) = id;
+21 −7
Original line number Diff line number Diff line
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
	int cpu = smp_processor_id();
	int prev_cpu = task_cpu(p);
	struct sched_domain *sd;
	struct sched_group *sg;
	int i;

	/*
	 * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
		return prev_cpu;

	/*
	 * Otherwise, check assigned siblings to find an elegible idle cpu.
	 * Otherwise, iterate the domains and find an elegible idle cpu.
	 */
	sd = rcu_dereference(per_cpu(sd_llc, target));

	for_each_lower_domain(sd) {
		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
			continue;
		if (idle_cpu(sd->idle_buddy))
			return sd->idle_buddy;
		sg = sd->groups;
		do {
			if (!cpumask_intersects(sched_group_cpus(sg),
						tsk_cpus_allowed(p)))
				goto next;

			for_each_cpu(i, sched_group_cpus(sg)) {
				if (!idle_cpu(i))
					goto next;
			}

			target = cpumask_first_and(sched_group_cpus(sg),
					tsk_cpus_allowed(p));
			goto done;
next:
			sg = sg->next;
		} while (sg != sd->groups);
	}
done:
	return target;
}