Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f68f447e authored by John Hawkes's avatar John Hawkes Committed by Linus Torvalds
Browse files

[PATCH] ia64 cpuset + build_sched_domains() mangles structures



I've already sent this to the maintainers, and this is now being sent to a
larger community audience.  I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.

The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive (a.k.a.
"isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched
groups to separate away the CPUs in this cpu-exclusive cpuset from the
remainder of the non-isolated CPUs.  This allows the non-isolated CPUs to
completely ignore the isolated CPUs when doing load-balancing.

Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong in both an isolated cpuset and a non-isolated cpuset.  Declaring a
cpuset that violates this presumption will produce flawed data structures
and will oops the kernel.

To trigger the problem (on a NUMA system with >1 CPUs per node):
   cd /dev/cpuset
   mkdir newcpuset
   cd newcpuset
   echo 0 >cpus
   echo 0 >mems
   echo 1 >cpu_exclusive

I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed also.  The fix
involves dynamically allocating sched_group_nodes[] and
sched_group_allnodes[] for each invocation of build_sched_domains(), rather
than using global arrays for these structures.  Care must be taken to
remember kmalloc() addresses so that arch_destroy_sched_domains() can
properly kfree() the new dynamic structures.

Signed-off-by: default avatarJohn Hawkes <hawkes@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 38f18527
Loading
Loading
Loading
Loading
+69 −21
Original line number Diff line number Diff line
@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu)
 * gets dynamically allocated.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];
static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];
static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];

static int cpu_to_allnodes_group(int cpu)
{
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu)
void build_sched_domains(const cpumask_t *cpu_map)
{
	int i;
#ifdef CONFIG_NUMA
	struct sched_group **sched_group_nodes = NULL;
	struct sched_group *sched_group_allnodes = NULL;

	/*
	 * Allocate the per-node list of sched groups
	 */
	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
					   GFP_ATOMIC);
	if (!sched_group_nodes) {
		printk(KERN_WARNING "Can not alloc sched group node list\n");
		return;
	}
	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif

	/*
	 * Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
		cpus_and(nodemask, nodemask, *cpu_map);

#ifdef CONFIG_NUMA
		if (num_online_cpus()
		if (cpus_weight(*cpu_map)
				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
			if (!sched_group_allnodes) {
				sched_group_allnodes
					= kmalloc(sizeof(struct sched_group)
							* MAX_NUMNODES,
						  GFP_KERNEL);
				if (!sched_group_allnodes) {
					printk(KERN_WARNING
					"Can not alloc allnodes sched group\n");
					break;
				}
				sched_group_allnodes_bycpu[i]
						= sched_group_allnodes;
			}
			sd = &per_cpu(allnodes_domains, i);
			*sd = SD_ALLNODES_INIT;
			sd->span = *cpu_map;
@@ -214,6 +242,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
	}

#ifdef CONFIG_NUMA
	if (sched_group_allnodes)
		init_sched_build_groups(sched_group_allnodes, *cpu_map,
					&cpu_to_allnodes_group);

@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
		int j;

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
		if (cpus_empty(nodemask)) {
			sched_group_nodes[i] = NULL;
			continue;
		}

		domainspan = sched_domain_node_span(i);
		cpus_and(domainspan, domainspan, *cpu_map);
@@ -372,6 +403,22 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
#ifdef CONFIG_NUMA
	int i;
	int cpu;

	for_each_cpu_mask(cpu, *cpu_map) {
		struct sched_group *sched_group_allnodes
			= sched_group_allnodes_bycpu[cpu];
		struct sched_group **sched_group_nodes
			= sched_group_nodes_bycpu[cpu];

		if (sched_group_allnodes) {
			kfree(sched_group_allnodes);
			sched_group_allnodes_bycpu[cpu] = NULL;
		}

		if (!sched_group_nodes)
			continue;

		for (i = 0; i < MAX_NUMNODES; i++) {
			cpumask_t nodemask = node_to_cpumask(i);
			struct sched_group *oldsg, *sg = sched_group_nodes[i];
@@ -389,8 +436,9 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
			kfree(oldsg);
			if (oldsg != sched_group_nodes[i])
				goto next_sg;
		sched_group_nodes[i] = NULL;
		}
		kfree(sched_group_nodes);
		sched_group_nodes_bycpu[cpu] = NULL;
	}
#endif
}