Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 49d2953c authored by Linus Torvalds
Browse files

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "Major changes:

   - Reworked CPU capacity code, for better SMP load balancing on
     systems with asymmetric CPUs. (Vincent Guittot, Morten Rasmussen)

   - Reworked RT task SMP balancing to be push based instead of pull
     based, to reduce latencies on large CPU count systems. (Steven
     Rostedt)

   - SCHED_DEADLINE support updates and fixes. (Juri Lelli)

   - SCHED_DEADLINE task migration support during CPU hotplug. (Wanpeng Li)

   - x86 mwait-idle optimizations and fixes. (Mike Galbraith, Len Brown)

   - sched/numa improvements. (Rik van Riel)

   - various cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
  sched/core: Drop debugging leftover trace_printk call
  sched/deadline: Support DL task migration during CPU hotplug
  sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()
  sched/deadline: Always enqueue on previous rq when dl_task_timer() fires
  sched/core: Remove unused argument from init_[rt|dl]_rq()
  sched/deadline: Fix rt runtime corruption when dl fails its global constraints
  sched/deadline: Avoid a superfluous check
  sched: Improve load balancing in the presence of idle CPUs
  sched: Optimize freq invariant accounting
  sched: Move CFS tasks to CPUs with higher capacity
  sched: Add SD_PREFER_SIBLING for SMT level
  sched: Remove unused struct sched_group_capacity::capacity_orig
  sched: Replace capacity_factor by usage
  sched: Calculate CPU's usage statistic and put it into struct sg_lb_stats::group_usage
  sched: Add struct rq::cpu_capacity_orig
  sched: Make scale_rt invariant with frequency
  sched: Make sched entity usage tracking scale-invariant
  sched: Remove frequency scaling from cpu_capacity
  sched: Track group sched_entity usage contributions
  sched: Add sched_avg::utilization_avg_contrib
  ...
parents cc76ee75 62a935b2
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
		     :: "a" (eax), "c" (ecx));
}

/*
 * Execute "sti" immediately followed by MWAIT in a single asm statement.
 *
 * @eax: value placed in EAX for the MWAIT instruction ("a" constraint)
 * @ecx: value placed in ECX for the MWAIT instruction ("c" constraint)
 *
 * trace_hardirqs_on() is called before the "sti" executes.
 * NOTE(review): presumably this keeps irq tracing in step with the
 * interrupt-enable done inside the asm -- confirm.
 */
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
	trace_hardirqs_on();
	/*
	 * "mwait %eax, %ecx;" -- emitted as raw opcode bytes, not by mnemonic.
	 * NOTE(review): "sti" and MWAIT look deliberately adjacent so that no
	 * interrupt is taken between them -- confirm against the STI
	 * description in the architecture manual.
	 */
	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
		     :: "a" (eax), "c" (ecx));
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
+51 −0
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
@@ -399,6 +400,53 @@ static void amd_e400_idle(void)
		default_idle();
}

/*
 * Decide whether the default C1 idle routine -- the one in use during boot
 * until cpuidle loads -- should be MWAIT rather than HALT.
 *
 * cpuidle cannot be relied on to install MWAIT, because it will not load on
 * systems that support only C1; the boot default therefore has to be right
 * on its own.  Intel Core2 and older machines prefer MWAIT over HALT for C1,
 * whereas some AMD machines depend on using HALT.
 *
 * Returns 1 for Intel hardware that has X86_FEATURE_MWAIT, 0 otherwise.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	/* Vendor is tested first; the feature bit is only looked at on Intel. */
	return c->x86_vendor == X86_VENDOR_INTEL &&
	       cpu_has(c, X86_FEATURE_MWAIT);
}

/*
 * MONITOR/MWAIT with no hints, used for the default C1 state.
 * This invokes MWAIT with interrupts enabled and no flags,
 * which is backwards compatible with the original MWAIT implementation.
 *
 * Every path through this function enables interrupts, either through
 * __sti_mwait() or through local_irq_enable(), and finishes with
 * __current_clr_polling().
 */

static void mwait_idle(void)
{
	/*
	 * Only take the MONITOR/MWAIT path when this returns zero.
	 * NOTE(review): presumably a nonzero return means a reschedule is
	 * already pending -- confirm against current_set_polling_and_test().
	 */
	if (!current_set_polling_and_test()) {
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			/*
			 * CPUs flagged with this bug get the thread flags word
			 * flushed, with a full barrier on each side, before
			 * MONITOR is armed on it.
			 */
			smp_mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			smp_mb(); /* quirk */
		}

		/* Arm MONITOR on the current thread's flags word, no extensions/hints. */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		/*
		 * Re-check after arming: only enter MWAIT (zero eax/ecx) if no
		 * reschedule is needed; otherwise just turn interrupts back on.
		 */
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		/* MWAIT path skipped: interrupts still have to be enabled. */
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
@@ -412,6 +460,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}
+2 −1
Original line number Diff line number Diff line
@@ -38,16 +38,17 @@ bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
#endif

void irq_work_run(void);
void irq_work_tick(void);
void irq_work_sync(struct irq_work *work);

#ifdef CONFIG_IRQ_WORK
#include <asm/irq_work.h>

void irq_work_run(void);
bool irq_work_needs_cpu(void);
#else
static inline bool irq_work_needs_cpu(void) { return false; }
static inline void irq_work_run(void) { }
#endif

#endif /* _LINUX_IRQ_WORK_H */
+17 −4
Original line number Diff line number Diff line
@@ -1123,15 +1123,28 @@ struct load_weight {
};

struct sched_avg {
	u64 last_runnable_update;
	s64 decay_count;
	/*
	 * utilization_avg_contrib describes the amount of time that a
	 * sched_entity is running on a CPU. It is based on running_avg_sum
	 * and is scaled in the range [0..SCHED_LOAD_SCALE].
	 * load_avg_contrib describes the amount of time that a sched_entity
	 * is runnable on a rq. It is based on both runnable_avg_sum and the
	 * weight of the task.
	 */
	unsigned long load_avg_contrib, utilization_avg_contrib;
	/*
	 * These sums represent an infinite geometric series and so are bound
	 * above by 1024/(1-y).  Thus we only need a u32 to store them for all
	 * choices of y < 1-2^(-32)*1024.
	 * running_avg_sum reflects the time that the sched_entity is
	 * effectively running on the CPU.
	 * runnable_avg_sum represents the amount of time a sched_entity is on
	 * a runqueue which includes the running time that is monitored by
	 * running_avg_sum.
	 */
	u32 runnable_avg_sum, runnable_avg_period;
	u64 last_runnable_update;
	s64 decay_count;
	unsigned long load_avg_contrib;
	u32 runnable_avg_sum, avg_period, running_avg_sum;
};

#ifdef CONFIG_SCHEDSTATS
+50 −46
Original line number Diff line number Diff line
@@ -689,6 +689,23 @@ static inline bool got_nohz_idle_kick(void)
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
	/*
	 * FIFO realtime policy runs the highest priority task. Other runnable
	 * tasks are of a lower priority. The scheduler tick does nothing.
	 */
	if (current->policy == SCHED_FIFO)
		return true;

	/*
	 * Round-robin realtime tasks time slice with other tasks at the same
	 * realtime priority. Is this task the only one at this priority?
	 */
	if (current->policy == SCHED_RR) {
		struct sched_rt_entity *rt_se = &current->rt;

		return rt_se->run_list.prev == rt_se->run_list.next;
	}

	/*
	 * More than one running task need preemption.
	 * nr_running update is assumed to be visible
@@ -5335,37 +5352,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
static int sched_cpu_inactive(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	unsigned long flags;
	long cpu = (long)hcpu;
	struct dl_bw *dl_b;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		set_cpu_active(cpu, false);

		/* explicitly allow suspend */
		if (!(action & CPU_TASKS_FROZEN)) {
			bool overflow;
			int cpus;

			rcu_read_lock_sched();
			dl_b = dl_bw_of(cpu);

			raw_spin_lock_irqsave(&dl_b->lock, flags);
			cpus = dl_bw_cpus(cpu);
			overflow = __dl_overflow(dl_b, cpus, 0, 0);
			raw_spin_unlock_irqrestore(&dl_b->lock, flags);

			rcu_read_unlock_sched();

			if (overflow)
				return notifier_from_errno(-EBUSY);
		}
		set_cpu_active((long)hcpu, false);
		return NOTIFY_OK;
	}

	default:
		return NOTIFY_DONE;
	}
}

static int __init migration_init(void)
{
@@ -5445,17 +5439,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
			break;
		}

		/*
		 * Even though we initialize ->capacity to something semi-sane,
		 * we leave capacity_orig unset. This allows us to detect if
		 * domain iteration is still funny without causing /0 traps.
		 */
		if (!group->sgc->capacity_orig) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
			break;
		}

		if (!cpumask_weight(sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
@@ -5939,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
		 * die on a /0 trap.
		 */
		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
		sg->sgc->capacity_orig = sg->sgc->capacity;

		/*
		 * Make sure the first group of this domain contains the
@@ -6250,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
	 */

	if (sd->flags & SD_SHARE_CPUCAPACITY) {
		sd->flags |= SD_PREFER_SIBLING;
		sd->imbalance_pct = 110;
		sd->smt_gain = 1178; /* ~15% */

@@ -7015,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
		 */

	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		cpuset_update_active_cpus(true);
		break;
	default:
@@ -7027,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
			       void *hcpu)
{
	switch (action) {
	unsigned long flags;
	long cpu = (long)hcpu;
	struct dl_bw *dl_b;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* explicitly allow suspend */
		if (!(action & CPU_TASKS_FROZEN)) {
			bool overflow;
			int cpus;

			rcu_read_lock_sched();
			dl_b = dl_bw_of(cpu);

			raw_spin_lock_irqsave(&dl_b->lock, flags);
			cpus = dl_bw_cpus(cpu);
			overflow = __dl_overflow(dl_b, cpus, 0, 0);
			raw_spin_unlock_irqrestore(&dl_b->lock, flags);

			rcu_read_unlock_sched();

			if (overflow)
				return notifier_from_errno(-EBUSY);
		}
		cpuset_update_active_cpus(false);
		break;
	case CPU_DOWN_PREPARE_FROZEN:
@@ -7173,8 +7177,8 @@ void __init sched_init(void)
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt, rq);
		init_dl_rq(&rq->dl, rq);
		init_rt_rq(&rq->rt);
		init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7214,7 +7218,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
		rq->post_schedule = 0;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
@@ -7813,7 +7817,7 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */

static int sched_dl_global_constraints(void)
static int sched_dl_global_validate(void)
{
	u64 runtime = global_rt_runtime();
	u64 period = global_rt_period();
@@ -7914,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
		if (ret)
			goto undo;

		ret = sched_rt_global_constraints();
		ret = sched_dl_global_validate();
		if (ret)
			goto undo;

		ret = sched_dl_global_constraints();
		ret = sched_rt_global_constraints();
		if (ret)
			goto undo;

Loading