
Commit 7125face authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched, x86: Avoid unnecessary overflow in sched_clock
  sched: Fix buglet in return_cfs_rq_runtime()
  sched: Avoid SMT siblings in select_idle_sibling() if possible
  sched: Set the command name of the idle tasks in SMP kernels
  sched, rt: Provide means of disabling cross-cpu bandwidth sharing
  sched: Document wait_for_completion_*() return values
  sched_fair: Fix a typo in the comment describing update_sd_lb_stats
  sched: Add a comment to effective_load() since it's a pain
parents 35337c83 4cecf6d4
+22 −1
@@ -32,6 +32,22 @@ extern int no_timer_check;
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ *
+ * In:
+ *
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * Although we may still have enough bits to store the value of ns,
+ * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
+ * leading to an incorrect result.
+ *
+ * To avoid this, we can decompose 'cycles' into quotient and remainder
+ * of division by SC.  Then,
+ *
+ * ns = (quot * SC + rem) * cyc2ns_scale / SC
+ *    = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
+ *
+ *			- sqazi@google.com
  */
 
 DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
 
 static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
+	unsigned long long quot;
+	unsigned long long rem;
 	int cpu = smp_processor_id();
 	unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
-	ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
+	quot = (cyc >> CYC2NS_SCALE_FACTOR);
+	rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
+	ns += quot * per_cpu(cyc2ns, cpu) +
+		((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
 	return ns;
 }
 
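A quick way to see the overflow the new comment guards against: with 64-bit arithmetic, cycles * cyc2ns_scale can wrap long before the final shift, while the quotient/remainder split keeps every intermediate product in range. The following standalone user-space sketch illustrates the decomposition; SC_SHIFT and the sample values are arbitrary stand-ins, not the kernel's actual constants.

/*
 * Standalone user-space sketch (not kernel code) of the overflow-avoidance
 * trick above. SC_SHIFT and the sample values are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define SC_SHIFT 10                      /* stand-in for CYC2NS_SCALE_FACTOR */
#define SC (1ULL << SC_SHIFT)

/* naive: cyc * scale may wrap around before the shift */
static uint64_t cycles_to_ns_naive(uint64_t cyc, uint64_t scale)
{
	return (cyc * scale) >> SC_SHIFT;
}

/* decomposed: quot * scale stays in range, only rem * scale is shifted */
static uint64_t cycles_to_ns_split(uint64_t cyc, uint64_t scale)
{
	uint64_t quot = cyc >> SC_SHIFT;
	uint64_t rem  = cyc & (SC - 1);

	return quot * scale + ((rem * scale) >> SC_SHIFT);
}

int main(void)
{
	/* a large cycle count, e.g. a TSC value after long uptime */
	uint64_t cyc   = 0xffffffffffffULL * 1000;	/* big enough to overflow the naive product */
	uint64_t scale = 1024 * 3;			/* arbitrary cyc2ns-style factor */

	printf("naive : %llu\n", (unsigned long long)cycles_to_ns_naive(cyc, scale));
	printf("split : %llu\n", (unsigned long long)cycles_to_ns_split(cyc, scale));
	return 0;
}
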
+3 −1
@@ -126,6 +126,8 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
+#define INIT_TASK_COMM "swapper"
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -162,7 +164,7 @@ extern struct cred init_cred;
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
-	.comm		= "swapper",					\
+	.comm		= INIT_TASK_COMM,				\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
 	.files		= &init_files,					\
+17 −0
@@ -71,6 +71,7 @@
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
+#include <linux/init_task.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. The timeout is in jiffies. It is not
  * interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
  */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
  *
  * This waits for completion of a specific task to be signaled. It is
  * interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
  *
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
  */
 long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
  *
  * This waits to be signaled for completion of a specific task. It can be
  * interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  */
 int __sched wait_for_completion_killable(struct completion *x)
 {
@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
  * This waits for either a completion of a specific task to be
  * signaled or for a specified timeout to expire. It can be
  * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
  */
 long __sched
 wait_for_completion_killable_timeout(struct completion *x,
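The return-value rules documented above translate into a standard call-site pattern. The sketch below is illustrative only; the function and the 100ms timeout are hypothetical and not part of this commit, but the wait_for_completion_interruptible_timeout() semantics match the kerneldoc just added.

/*
 * Illustrative only (not from this commit): how a caller typically acts on
 * the return values documented above. 'done' would be a completion that
 * some other context signals with complete().
 */
#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int example_wait_for_done(struct completion *done)
{
	long ret;

	ret = wait_for_completion_interruptible_timeout(done,
							msecs_to_jiffies(100));
	if (ret < 0)			/* -ERESTARTSYS: interrupted by a signal */
		return ret;
	if (ret == 0)			/* timed out */
		return -ETIMEDOUT;

	/* ret > 0: completed, with 'ret' jiffies left before the timeout */
	return 0;
}
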
@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
 }
 
 /*
+125 −34
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+	long tg_weight;
+
+	/*
+	 * Use this CPU's actual weight instead of the last load_contribution
+	 * to gain a more accurate current total weight. See
+	 * update_cfs_rq_load_contribution().
+	 */
+	tg_weight = atomic_read(&tg->load_weight);
+	tg_weight -= cfs_rq->load_contribution;
+	tg_weight += cfs_rq->load.weight;
+
+	return tg_weight;
+}
+
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
-	long load_weight, load, shares;
+	long tg_weight, load, shares;
 
+	tg_weight = calc_tg_weight(tg, cfs_rq);
 	load = cfs_rq->load.weight;
 
-	load_weight = atomic_read(&tg->load_weight);
-	load_weight += load;
-	load_weight -= cfs_rq->load_contribution;
-
 	shares = (tg->shares * load);
-	if (load_weight)
-		shares /= load_weight;
+	if (tg_weight)
+		shares /= tg_weight;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
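For a concrete feel of the formula above, here is a toy user-space arithmetic check with made-up numbers; the values (and the MIN_SHARES stand-in) are chosen only to make the division easy to follow, not taken from the kernel.

/*
 * Toy check of the calc_cfs_shares() formula above, with made-up numbers:
 * this cpu carries 1024 of a 3072 total group weight, so it should receive
 * roughly a third of the group's configured shares.
 */
#include <stdio.h>

int main(void)
{
	long tg_shares  = 1024;		/* group's configured shares (tg->shares)   */
	long tg_weight  = 3072;		/* sum of the group's per-cpu rq weights     */
	long load       = 1024;		/* this cpu's cfs_rq->load.weight            */
	long min_shares = 2;		/* stand-in for MIN_SHARES                   */

	long shares = tg_shares * load;
	if (tg_weight)
		shares /= tg_weight;
	if (shares < min_shares)
		shares = min_shares;

	/* 1024 * 1024 / 3072 = 341 */
	printf("shares = %ld\n", shares);
	return 0;
}
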
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
 		return;
 
 	__return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ *   s_i = rw_i / \Sum rw_j						(1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ *   rw_i = {   2,   4,   1,   0 }
+ *   s_i  = { 2/7, 4/7, 1/7,   0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ *   rw'_i = {   3,   4,   1,   0 }
+ *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ *   dw_i = S * (s'_i - s_i)						(3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent)
+	if (!tg->parent)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
-		long lw, w;
+		long w, W;
 
 		tg = se->my_q->tg;
-		w = se->my_q->load.weight;
 
-		/* use this cpu's instantaneous contribution */
-		lw = atomic_read(&tg->load_weight);
-		lw -= se->my_q->load_contribution;
-		lw += w + wg;
+		/*
+		 * W = @wg + \Sum rw_j
+		 */
+		W = wg + calc_tg_weight(tg, se->my_q);
 
-		wl += w;
+		/*
+		 * w = rw_i + @wl
+		 */
+		w = se->my_q->load.weight + wl;
 
-		if (lw > 0 && wl < lw)
-			wl = (wl * tg->shares) / lw;
+		/*
+		 * wl = S * s'_i; see (2)
+		 */
+		if (W > 0 && w < W)
+			wl = (w * tg->shares) / W;
 		else
 			wl = tg->shares;
 
-		/* zero point is MIN_SHARES */
+		/*
+		 * Per the above, wl is the new se->load.weight value; since
+		 * those are clipped to [MIN_SHARES, ...) do so now. See
+		 * calc_cfs_shares().
+		 */
 		if (wl < MIN_SHARES)
 			wl = MIN_SHARES;
+
+		/*
+		 * wl = dw_i = S * (s'_i - s_i); see (3)
+		 */
 		wl -= se->load.weight;
+
+		/*
+		 * Recursively apply this logic to all parent groups to compute
+		 * the final effective load change on the root group. Since
+		 * only the @tg group gets extra weight, all parent groups can
+		 * only redistribute existing shares. @wl is the shift in shares
+		 * resulting from this level per the above.
+		 */
 		wg = 0;
 	}
 
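The 4-CPU example in the new comment can be reproduced with a few lines of floating-point arithmetic. The sketch below is not kernel code; it only evaluates equations (1) through (3) for the documented rw_i distribution, leaving S symbolic as "the group weight seen by the parent".

/*
 * Standalone check of the effective_load() example above: 4 CPUs,
 * rw_i = {2, 4, 1, 0} (in units of one task weight), one task added on CPU 0.
 */
#include <stdio.h>

int main(void)
{
	double rw[4] = { 2, 4, 1, 0 };		/* per-cpu runqueue weights      */
	double wl = 1, wg = 1;			/* add one task's weight on cpu0 */
	double sum = rw[0] + rw[1] + rw[2] + rw[3];

	for (int i = 0; i < 4; i++) {
		double s     = rw[i] / sum;				/* (1) */
		double s_new = (rw[i] + (i == 0 ? wl : 0)) /
			       (sum + wg);				/* (2) */
		double dw    = s_new - s;		/* (3), per unit of S */

		printf("cpu%d: s=%.4f s'=%.4f dw=%+.4f (*S)\n", i, s, s_new, dw);
	}
	/* Expect cpu0: +5/56 = +0.0893 and cpu1: -4/56 = -0.0714, as in the comment. */
	return 0;
}
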
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	int i;
+	struct sched_group *sg;
+	int i, smt = 0;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	rcu_read_lock();
+again:
 	for_each_domain(target, sd) {
-		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
-			break;
+		if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+			continue;
 
-		for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
-			if (idle_cpu(i)) {
-				target = i;
-				break;
+		if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
+			if (!smt) {
+				smt = 1;
+				goto again;
 			}
+			break;
 		}
 
-		/*
-		 * Lets stop looking for an idle sibling when we reached
-		 * the domain that spans the current cpu and prev_cpu.
-		 */
-		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
-			break;
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
+
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
 	}
+done:
 	rcu_read_unlock();
 
 	return target;
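The reworked loop above first looks for a sched_group (a core) whose CPUs are all idle, and only once that search has failed does it settle for an idle SMT sibling. The toy sketch below illustrates that selection order on a made-up 8-CPU, 2-way-SMT topology; it is not the kernel implementation, which walks sched_domains and sched_groups and honours the task's allowed cpumask.

/*
 * Toy illustration of the "prefer a fully idle core" policy above.
 * Topology and idle state are made-up arrays.
 */
#include <stdio.h>

#define NR_CPUS   8
#define SMT_WIDTH 2			/* 2 hardware threads per core */

static int cpu_idle[NR_CPUS] = { 0, 1, 1, 1, 0, 1, 0, 0 };

/* return the first cpu of a core whose threads are all idle, or -1 */
static int find_idle_core(void)
{
	for (int core = 0; core < NR_CPUS / SMT_WIDTH; core++) {
		int first = core * SMT_WIDTH;
		int all_idle = 1;

		for (int t = 0; t < SMT_WIDTH; t++)
			all_idle &= cpu_idle[first + t];
		if (all_idle)
			return first;
	}
	return -1;
}

/* fallback: any idle SMT sibling at all */
static int find_idle_thread(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_idle[cpu])
			return cpu;
	return -1;
}

int main(void)
{
	int target = find_idle_core();

	if (target < 0)
		target = find_idle_thread();
	/* cpus {2,3} form the only fully idle core, so target is 2 */
	printf("target = %d\n", target);
	return 0;
}
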
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 }
 
 /**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
+1 −0
@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
 SCHED_FEAT(TTWU_QUEUE, 1)
 
 SCHED_FEAT(FORCE_SD_OVERLAP, 0)
+SCHED_FEAT(RT_RUNTIME_SHARE, 1)