
Commit af79ad2b authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The main changes are:

   - irqtime accounting cleanups and enhancements. (Frederic Weisbecker)

   - schedstat debugging enhancements, make it more broadly runtime
     available. (Josh Poimboeuf)

   - More work on asymmetric topology/capacity scheduling. (Morten
     Rasmussen)

   - sched/wait fixes and cleanups. (Oleg Nesterov)

   - PELT (per entity load tracking) improvements. (Peter Zijlstra)

   - Rewrite and enhance select_idle_siblings(). (Peter Zijlstra)

   - sched/numa enhancements/fixes (Rik van Riel)

   - sched/cputime scalability improvements (Stanislaw Gruszka)

   - Load calculation arithmetics fixes. (Dietmar Eggemann)

   - sched/deadline enhancements (Tommaso Cucinotta)

   - Fix utilization accounting when switching to the SCHED_NORMAL
     policy. (Vincent Guittot)

   - ... plus misc cleanups and enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/irqtime: Consolidate irqtime flushing code
  sched/irqtime: Consolidate accounting synchronization with u64_stats API
  u64_stats: Introduce IRQs disabled helpers
  sched/irqtime: Remove needless IRQs disablement on kcpustat update
  sched/irqtime: No need for preempt-safe accessors
  sched/fair: Fix min_vruntime tracking
  sched/debug: Add SCHED_WARN_ON()
  sched/core: Fix set_user_nice()
  sched/fair: Introduce set_curr_task() helper
  sched/core, ia64: Rename set_curr_task()
  sched/core: Fix incorrect utilization accounting when switching to fair class
  sched/core: Optimize SCHED_SMT
  sched/core: Rewrite and improve select_idle_siblings()
  sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared
  sched/core: Introduce 'struct sched_domain_shared'
  sched/core: Restructure destroy_sched_domain()
  sched/core: Remove unused @cpu argument from destroy_sched_domain*()
  sched/wait: Introduce init_wait_entry()
  sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock()
  sched/wait: Avoid abort_exclusive_wait() in ___wait_event()
  ...
parents e606d81d 447976ef
Documentation/scheduler/sched-deadline.txt  +18 −0
@@ -16,6 +16,7 @@ CONTENTS
   4.1 System-wide settings
   4.2 Task interface
   4.3 Default behavior
+  4.4 Behavior of sched_yield()
 5. Tasks CPU affinity
   5.1 SCHED_DEADLINE and cpusets HOWTO
 6. Future plans
@@ -426,6 +427,23 @@ CONTENTS
 Finally, notice that in order not to jeopardize the admission control a
 -deadline task cannot fork.
 
+
+4.4 Behavior of sched_yield()
+-----------------------------
+
+ When a SCHED_DEADLINE task calls sched_yield(), it gives up its
+ remaining runtime and is immediately throttled, until the next
+ period, when its runtime will be replenished (a special flag
+ dl_yielded is set and used to handle correctly throttling and runtime
+ replenishment after a call to sched_yield()).
+
+ This behavior of sched_yield() allows the task to wake up exactly at
+ the beginning of the next period. Also, this may be useful in the
+ future with bandwidth reclaiming mechanisms, where sched_yield() will
+ make the leftover runtime available for reclamation by other
+ SCHED_DEADLINE tasks.
+
+
 5. Tasks CPU affinity
 =====================
 
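Note: the documented semantics above lend themselves to a small userspace demo. The following is a minimal sketch (not from the patch) of a periodic SCHED_DEADLINE task that finishes early and yields its leftover runtime, waking at the start of its next period. sched_setattr() has no glibc wrapper, so it is invoked via syscall(); struct sched_attr mirrors the kernel UAPI layout, and do_work() is a hypothetical placeholder.

#define _GNU_SOURCE
#include <sched.h>		/* sched_yield() */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6	/* value from the kernel UAPI headers */
#endif

/* Mirrors the kernel's struct sched_attr; glibc provides no wrapper. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* all times in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static void do_work(void)
{
	/* hypothetical placeholder for the periodic job */
}

int main(void)
{
	struct sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_DEADLINE,
		.sched_runtime  = 10 * 1000 * 1000,	/*  10 ms budget */
		.sched_deadline = 30 * 1000 * 1000,	/*  30 ms        */
		.sched_period   = 100 * 1000 * 1000,	/* 100 ms        */
	};

	/* Needs root/CAP_SYS_NICE; admission control may also refuse. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	for (;;) {
		do_work();
		/*
		 * Give back the unused runtime; the dl_yielded handling
		 * described above throttles the task until the start of
		 * the next period, where its runtime is replenished.
		 */
		sched_yield();
	}
}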
arch/ia64/kernel/mca.c  +5 −5
@@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 	int cpu = smp_processor_id();
 
 	previous_current = curr_task(cpu);
-	set_curr_task(cpu, current);
+	ia64_set_curr_task(cpu, current);
 	if ((p = strchr(current->comm, ' ')))
 		*p = '\0';
 
@@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 				cpumask_clear_cpu(i, &mca_cpu);	/* wake next cpu */
 				while (monarch_cpu != -1)
 					cpu_relax();	/* spin until last cpu leaves */
-				set_curr_task(cpu, previous_current);
+				ia64_set_curr_task(cpu, previous_current);
 				ia64_mc_info.imi_rendez_checkin[cpu]
 						= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 				return;
 			}
 		}
 	}
-	set_curr_task(cpu, previous_current);
+	ia64_set_curr_task(cpu, previous_current);
 	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
 }
@@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 		NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
 
 		mprintk("Slave on cpu %d returning to normal service.\n", cpu);
-		set_curr_task(cpu, previous_current);
+		ia64_set_curr_task(cpu, previous_current);
 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 		atomic_dec(&slaves);
 		return;
@@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 
 	mprintk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
 	atomic_dec(&monarchs);
-	set_curr_task(cpu, previous_current);
+	ia64_set_curr_task(cpu, previous_current);
 	monarch_cpu = -1;
 	return;
 }
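Two shortlog entries above explain this ia64 churn: "sched/core, ia64: Rename set_curr_task()" frees the generic name so that "sched/fair: Introduce set_curr_task() helper" can claim it in the scheduler core. A minimal sketch of that helper (not part of this file's diff; the merged body may differ in detail), assuming the per-class set_curr_task callback in struct sched_class:

/*
 * Sketch of the new scheduler-core helper: after a running task
 * changes class or priority, re-run its class's set_curr_task()
 * callback so per-class bookkeeping is refreshed.
 */
static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
{
	curr->sched_class->set_curr_task(rq);
}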
arch/x86/kernel/smpboot.c  +30 −16
@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
-static struct sched_domain_topology_level numa_inside_package_topology[] = {
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
 #endif
 	{ NULL, },
 };
 
+static struct sched_domain_topology_level x86_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 /*
- * set_sched_topology() sets the topology internal to a CPU.  The
- * NUMA topologies are layered on top of it to build the full
- * system topology.
- *
- * If NUMA nodes are observed to occur within a CPU package, this
- * function should be called.  It forces the sched domain code to
- * only use the SMT level for the CPU portion of the topology.
- * This essentially falls back to relying on NUMA information
- * from the SRAT table to describe the entire system topology
- * (except for hyperthreads).
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
  */
-static void primarily_use_numa_for_topology(void)
-{
-	set_sched_topology(numa_inside_package_topology);
-}
+static bool x86_has_numa_in_package;
 
 void set_cpu_sibling_map(int cpu)
 {
@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
 				c->booted_cores = cpu_data(i).booted_cores;
 		}
 		if (match_die(c, o) && !topology_same_node(c, o))
-			primarily_use_numa_for_topology();
+			x86_has_numa_in_package = true;
 	}
 
 	threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
+
+	/*
+	 * Set 'default' x86 topology, this matches default_topology() in that
+	 * it has NUMA nodes as a topology level. See also
+	 * native_smp_cpus_done().
+	 *
+	 * Must be done before set_cpu_sibling_map() is run.
+	 */
+	set_sched_topology(x86_topology);
+
 	set_cpu_sibling_map(0);
 
 	switch (smp_sanity_check(max_cpus)) {
@@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 	pr_debug("Boot done\n");
 
+	if (x86_has_numa_in_package)
+		set_sched_topology(x86_numa_in_package_topology);
+
 	nmi_selftest();
 	impress_friends();
 	setup_ioapic_dest();
include/linux/kernel.h  +3 −6
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
 __printf(1, 2)
-void panic(const char *fmt, ...)
-	__noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-void do_exit(long error_code)
-	__noreturn;
-void complete_and_exit(struct completion *, long)
-	__noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;
 
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
include/linux/sched.h  +28 −2
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
 	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 }
 
+void __noreturn do_task_dead(void);
+
 struct nsproxy;
 struct user_namespace;
 
@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu power */
+#define SD_ASYM_CPUCAPACITY	0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
@@ -1064,6 +1067,12 @@ extern int sched_domain_level_max;
 
 struct sched_group;
 
+struct sched_domain_shared {
+	atomic_t	ref;
+	atomic_t	nr_busy_cpus;
+	int		has_idle_cores;
+};
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain *parent;	/* top domain must be null terminated */
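The new struct sched_domain_shared gives every CPU under one last-level cache a single shared, reference-counted blob of state; has_idle_cores is the hint consulted by the select_idle_siblings() rewrite in this merge. A sketch of the intended access pattern (assuming the per-CPU sd_llc_shared pointer the scheduler keeps for the LLC domain; details may differ from the merged code):

/*
 * Sketch: has_idle_cores is only a heuristic hint, so racy
 * WRITE_ONCE()/READ_ONCE() access under RCU is good enough.
 */
static inline void set_idle_cores(int cpu, int val)
{
	struct sched_domain_shared *sds;

	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds)
		WRITE_ONCE(sds->has_idle_cores, val);
}

static inline bool test_idle_cores(int cpu, bool def)
{
	struct sched_domain_shared *sds;

	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds)
		return READ_ONCE(sds->has_idle_cores);

	return def;
}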
@@ -1094,6 +1103,8 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long next_decay_max_lb_cost;
 
+	u64 avg_scan_cost;		/* select_idle_sibling */
+
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1132,6 +1143,7 @@ struct sched_domain {
 		void *private;		/* used during construction */
 		struct rcu_head rcu;	/* used during destruction */
 	};
+	struct sched_domain_shared *shared;
 
 	unsigned int span_weight;
 	/*
@@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void);
 
 struct sd_data {
 	struct sched_domain **__percpu sd;
+	struct sched_domain_shared **__percpu sds;
 	struct sched_group **__percpu sg;
 	struct sched_group_capacity **__percpu sgc;
 };
@@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
 	return p->pid == 0;
 }
 extern struct task_struct *curr_task(int cpu);
-extern void set_curr_task(int cpu, struct task_struct *p);
+extern void ia64_set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
 
@@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
  * cond_resched_lock() will drop the spinlock before scheduling,
  * cond_resched_softirq() will enable bhs before scheduling.
  */
+#ifndef CONFIG_PREEMPT
 extern int _cond_resched(void);
+#else
+static inline int _cond_resched(void) { return 0; }
+#endif
 
 #define cond_resched() ({			\
 	___might_sleep(__FILE__, __LINE__, 0);	\
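The #ifndef CONFIG_PREEMPT split encodes a simple fact: on a fully preemptible kernel the scheduler can already interrupt the task anywhere, so an explicit reschedule point adds nothing and _cond_resched() collapses to a no-op stub, leaving only the might-sleep debug check inside cond_resched(). The usual call pattern in a long kernel loop looks like this sketch (struct item and process_item() are hypothetical stand-ins):

static void process_table(struct item *table, unsigned long nents)
{
	unsigned long i;

	for (i = 0; i < nents; i++) {
		process_item(&table[i]);	/* hypothetical per-item work */
		cond_resched();			/* voluntary preemption point */
	}
}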
@@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void)
 #endif
 }
 
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	return p->preempt_disable_ip;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
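get_preempt_disable_ip() follows the usual pattern for CONFIG-gated fields: callers stay #ifdef-free and the !CONFIG_DEBUG_PREEMPT build constant-folds the call away. A sketch of the kind of diagnostic it enables (the scheduler's might-sleep/scheduling-while-atomic reports; exact call sites in the merge may differ):

static void report_preempt_disable_ip(struct task_struct *p)
{
	unsigned long ip = get_preempt_disable_ip(p);

	/* %pS prints symbol+offset for a kernel code address */
	if (ip)
		pr_err("preempt disabled at: %pS\n", (void *)ip);
}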