Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 76e4f660 authored by Ashok Raj's avatar Ashok Raj Committed by Linus Torvalds
Browse files

[PATCH] x86_64: CPU hotplug support



  Experimental CPU hotplug patch for x86_64
  -----------------------------------------
This supports logical CPU online and offline.
- Test with maxcpus=1, and then kick other cpu's off to test if init code
  is all cleaned up. CONFIG_SCHED_SMT works as well.
- idle threads are forked on demand from keventd threads for clean startup

TBD:
1. Not tested on a real NUMA machine (tested with numa=fake=2)
2. Handle ACPI pieces for physical hotplug support.

Signed-off-by: default avatarAshok Raj <ashok.raj@intel.com>
Acked-by: default avatarAndi Kleen <ak@muc.de>
Acked-by: default avatarZwane Mwaikambo <zwane@arm.linux.org.uk>
Signed-off-by: default avatar <Shaohua.li&lt;shaohua.li@intel.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent e6982c67
Loading
Loading
Loading
Loading
+7 −8
Original line number Diff line number Diff line
@@ -73,12 +73,11 @@ static int __init topology_init(void)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (node_online(i))
	for_each_online_node(i)
		arch_register_node(i);
	}
	for (i = 0; i < NR_CPUS; i++)
		if (cpu_possible(i)) arch_register_cpu(i);

	for_each_cpu(i)
		arch_register_cpu(i);
	return 0;
}

@@ -88,8 +87,8 @@ static int __init topology_init(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++)
		if (cpu_possible(i)) arch_register_cpu(i);
	for_each_cpu(i)
		arch_register_cpu(i);
	return 0;
}

+9 −0
Original line number Diff line number Diff line
@@ -313,6 +313,15 @@ config NR_CPUS
	  This is purely to save memory - each supported CPU requires
	  memory in the static kernel configuration.

config HOTPLUG_CPU
	bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
	depends on SMP && HOTPLUG && EXPERIMENTAL
	help
		Say Y here to experiment with turning CPUs off and on.  CPUs
		can be controlled through /sys/devices/system/cpu/cpu#.
		Say N if you want to disable CPU hotplug.


config HPET_TIMER
	bool
	default y
+29 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>

@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
	return 1;
}

#ifdef CONFIG_HOTPLUG_CPU
void fixup_irqs(cpumask_t map)
{
	unsigned int irq;
	static int warned;

	for (irq = 0; irq < NR_IRQS; irq++) {
		cpumask_t mask;
		if (irq == 2)
			continue;

		cpus_and(mask, irq_affinity[irq], map);
		if (any_online_cpu(mask) == NR_CPUS) {
			printk("Breaking affinity for irq %i\n", irq);
			mask = map;
		}
		if (irq_desc[irq].handler->set_affinity)
			irq_desc[irq].handler->set_affinity(irq, mask);
		else if (irq_desc[irq].action && !(warned++))
			printk("Cannot set affinity for irq %i\n", irq);
	}

	/* That doesn't seem sufficient.  Give it 1ms. */
	local_irq_enable();
	mdelay(1);
	local_irq_disable();
}
#endif
+28 −1
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 */

@@ -18,6 +19,7 @@

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -154,6 +156,29 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	while (1)
		safe_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
@@ -174,6 +199,8 @@ void cpu_idle (void)
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			idle();
		}

+190 −15
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@
 *      Andi Kleen              :       Converted to new state machine.
 *					Various cleanups.
 *					Probably mostly hotplug CPU ready now.
 *	Ashok Raj			: CPU hotplug support
 */


@@ -98,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
extern unsigned char trampoline_data[];
extern unsigned char trampoline_end[];

/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };

/*
 * Store all idle threads, this can be reused instead of creating
 * a new thread. Also avoids complicated thread destroy functionality
 * for idle threads.
 */
struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;

#define get_idle_for_cpu(x)     (idle_thread_array[(x)])
#define set_idle_for_cpu(x,p)   (idle_thread_array[(x)] = (p))

/*
 * cpu_possible_map should be static, it cannot change as cpu's
 * are onlined, or offlined. The reason is per-cpu data-structures
 * are allocated by some modules at init time, and dont expect to
 * do this dynamically on cpu arrival/departure.
 * cpu_present_map on the other hand can change dynamically.
 * In case when cpu_hotplug is not compiled, then we resort to current
 * behaviour, which is cpu_possible == cpu_present.
 * If cpu-hotplug is supported, then we need to preallocate for all
 * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
 * - Ashok Raj
 */
#ifdef CONFIG_HOTPLUG_CPU
#define fixup_cpu_possible_map(x)	cpu_set((x), cpu_possible_map)
#else
#define fixup_cpu_possible_map(x)
#endif

/*
 * Currently trivial. Write the real->protected mode
 * bootstrap into the page concerned. The caller
@@ -623,33 +655,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
	return (send_status | accept_status);
}

struct create_idle {
	struct task_struct *idle;
	struct completion done;
	int cpu;
};

void do_fork_idle(void *_c_idle)
{
	struct create_idle *c_idle = _c_idle;

	c_idle->idle = fork_idle(c_idle->cpu);
	complete(&c_idle->done);
}

/*
 * Boot one CPU.
 */
static int __cpuinit do_boot_cpu(int cpu, int apicid)
{
	struct task_struct *idle;
	unsigned long boot_error;
	int timeout;
	unsigned long start_rip;
	struct create_idle c_idle = {
		.cpu = cpu,
		.done = COMPLETION_INITIALIZER(c_idle.done),
	};
	DECLARE_WORK(work, do_fork_idle, &c_idle);

	c_idle.idle = get_idle_for_cpu(cpu);

	if (c_idle.idle) {
		c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
			(THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
		init_idle(c_idle.idle, cpu);
		goto do_rest;
	}

	/*
	 * We can't use kernel_thread since we must avoid to
	 * reschedule the child.
	 * During cold boot process, keventd thread is not spun up yet.
	 * When we do cpu hot-add, we create idle threads on the fly, we should
	 * not acquire any attributes from the calling context. Hence the clean
	 * way to create kernel_threads() is to do that from keventd().
	 * We do the current_is_keventd() due to the fact that ACPI notifier
	 * was also queuing to keventd() and when the caller is already running
	 * in context of keventd(), we would end up with locking up the keventd
	 * thread.
	 */
	idle = fork_idle(cpu);
	if (IS_ERR(idle)) {
	if (!keventd_up() || current_is_keventd())
		work.func(work.data);
	else {
		schedule_work(&work);
		wait_for_completion(&c_idle.done);
	}

	if (IS_ERR(c_idle.idle)) {
		printk("failed fork for CPU %d\n", cpu);
		return PTR_ERR(idle);
		return PTR_ERR(c_idle.idle);
	}

	cpu_pda[cpu].pcurrent = idle;
	set_idle_for_cpu(cpu, c_idle.idle);

do_rest:

	cpu_pda[cpu].pcurrent = c_idle.idle;

	start_rip = setup_trampoline();

	init_rsp = idle->thread.rsp;
	init_rsp = c_idle.idle->thread.rsp;
	per_cpu(init_tss,cpu).rsp0 = init_rsp;
	initial_code = start_secondary;
	clear_ti_thread_flag(idle->thread_info, TIF_FORK);
	clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);

	printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
	       start_rip, init_rsp);
@@ -925,10 +1001,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
		int apicid = cpu_present_to_apicid(i);
		if (physid_isset(apicid, phys_cpu_present_map)) {
			cpu_set(i, cpu_present_map);
			/* possible map would be different if we supported real
			   CPU hotplug. */
			cpu_set(i, cpu_possible_map);
		}
		fixup_cpu_possible_map(i);
	}

	if (smp_sanity_check(max_cpus) < 0) {
@@ -977,9 +1052,6 @@ void __init smp_prepare_boot_cpu(void)

/*
 * Entry point to boot a CPU.
 *
 * This is all __cpuinit, not __devinit for now because we don't support
 * CPU hotplug (yet).
 */
int __cpuinit __cpu_up(unsigned int cpu)
{
@@ -996,6 +1068,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
		return -EINVAL;
	}

	/*
	 * Already booted CPU?
	 */
 	if (cpu_isset(cpu, cpu_callin_map)) {
		Dprintk("do_boot_cpu %d Already started\n", cpu);
 		return -ENOSYS;
	}

	/* Boot it! */
	err = do_boot_cpu(cpu, apicid);
	if (err < 0) {
@@ -1008,7 +1088,9 @@ int __cpuinit __cpu_up(unsigned int cpu)

	while (!cpu_isset(cpu, cpu_online_map))
		cpu_relax();
	return 0;
	err = 0;

	return err;
}

/*
@@ -1016,7 +1098,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
 */
void __init smp_cpus_done(unsigned int max_cpus)
{
#ifndef CONFIG_HOTPLUG_CPU
	zap_low_mappings();
#endif
	smp_cleanup_boot();

#ifdef CONFIG_X86_IO_APIC
@@ -1028,3 +1112,94 @@ void __init smp_cpus_done(unsigned int max_cpus)

	check_nmi_watchdog();
}

#ifdef CONFIG_HOTPLUG_CPU

static void
remove_siblinginfo(int cpu)
{
	int sibling;

	for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
		cpu_clear(cpu, cpu_sibling_map[sibling]);
	for_each_cpu_mask(sibling, cpu_core_map[cpu])
		cpu_clear(cpu, cpu_core_map[sibling]);
	cpus_clear(cpu_sibling_map[cpu]);
	cpus_clear(cpu_core_map[cpu]);
	phys_proc_id[cpu] = BAD_APICID;
	cpu_core_id[cpu] = BAD_APICID;
}

void remove_cpu_from_maps(void)
{
	int cpu = smp_processor_id();

	cpu_clear(cpu, cpu_callout_map);
	cpu_clear(cpu, cpu_callin_map);
	clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
}

int __cpu_disable(void)
{
	int cpu = smp_processor_id();

	/*
	 * Perhaps use cpufreq to drop frequency, but that could go
	 * into generic code.
 	 *
	 * We won't take down the boot processor on i386 due to some
	 * interrupts only being able to be serviced by the BSP.
	 * Especially so if we're not using an IOAPIC	-zwane
	 */
	if (cpu == 0)
		return -EBUSY;

	disable_APIC_timer();

	/*
	 * HACK:
	 * Allow any queued timer interrupts to get serviced
	 * This is only a temporary solution until we cleanup
	 * fixup_irqs as we do for IA64.
	 */
	local_irq_enable();
	mdelay(1);

	local_irq_disable();
	remove_siblinginfo(cpu);

	/* It's now safe to remove this processor from the online map */
	cpu_clear(cpu, cpu_online_map);
	remove_cpu_from_maps();
	fixup_irqs(cpu_online_map);
	return 0;
}

void __cpu_die(unsigned int cpu)
{
	/* We don't do anything here: idle task is faking death itself. */
	unsigned int i;

	for (i = 0; i < 10; i++) {
		/* They ack this in play_dead by setting CPU_DEAD */
		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
			return;
		current->state = TASK_UNINTERRUPTIBLE;
		schedule_timeout(HZ/10);
	}
 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}

#else /* ... !CONFIG_HOTPLUG_CPU */

int __cpu_disable(void)
{
	return -ENOSYS;
}

void __cpu_die(unsigned int cpu)
{
	/* We said "no" in __cpu_disable */
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
Loading