Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 2febb53a authored by Satya Durga Srinivasu Prabhala
Browse files

sched/fair: Add snapshot of load-balancing changes



This snapshot is taken from msm-4.14 as of commit 871eac76e6be567
(Merge "msm: pcie: provide option to override maximum GEN speed").

Change-Id: I75555a04020478e8d589b6a0fba209501c360dad
Signed-off-by: Satya Durga Srinivasu Prabhala <satyap@codeaurora.org>
parent 0fb3d191
Loading
Loading
Loading
Loading
+182 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
#include <linux/sched/idle.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread:
@@ -208,6 +209,187 @@ TRACE_EVENT(sched_migrate_task,
		  __entry->orig_cpu, __entry->dest_cpu)
);

/*
 * Tracepoint for load balancing:
 */
#ifdef CONFIG_SMP
#if NR_CPUS > 32
#error "Unsupported NR_CPUS for lb tracepoint."
#endif
/*
 * sched_load_balance - emitted once per load_balance() attempt on @cpu.
 *
 * @cpu:                CPU running the balance pass
 * @idle:               idle state of @cpu at balance time (CPU_IDLE,
 *                      CPU_NEWLY_IDLE or busy)
 * @balance:            value of *continue_balancing after the attempt
 * @group_mask:         cpumask bits of the busiest group; fits in one
 *                      unsigned long because the #if above rejects
 *                      NR_CPUS > 32
 * @busiest_nr_running: nr_running of the busiest runqueue (0 if none found)
 * @imbalance:          load imbalance computed by calculate_imbalance()
 * @env_flags:          LBF_* flags left in lb_env after the attempt
 * @ld_moved:           number of tasks actually migrated
 * @balance_interval:   current domain balance interval
 * @active_balance:     whether active (push) migration was kicked
 */
TRACE_EVENT(sched_load_balance,

	TP_PROTO(int cpu, enum cpu_idle_type idle, int balance,
		unsigned long group_mask, int busiest_nr_running,
		unsigned long imbalance, unsigned int env_flags, int ld_moved,
		unsigned int balance_interval, int active_balance),

	TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running,
		imbalance, env_flags, ld_moved, balance_interval,
		active_balance),

	TP_STRUCT__entry(
		__field(int,                    cpu)
		__field(enum cpu_idle_type,     idle)
		__field(int,                    balance)
		__field(unsigned long,          group_mask)
		__field(int,                    busiest_nr_running)
		__field(unsigned long,          imbalance)
		__field(unsigned int,           env_flags)
		__field(int,                    ld_moved)
		__field(unsigned int,           balance_interval)
		__field(int,                    active_balance)
	),

	TP_fast_assign(
		__entry->cpu                    = cpu;
		__entry->idle                   = idle;
		__entry->balance                = balance;
		__entry->group_mask             = group_mask;
		__entry->busiest_nr_running     = busiest_nr_running;
		__entry->imbalance              = imbalance;
		__entry->env_flags              = env_flags;
		__entry->ld_moved               = ld_moved;
		__entry->balance_interval       = balance_interval;
		__entry->active_balance		= active_balance;
	),

	/*
	 * NOTE(review): imbalance is unsigned long but printed with %ld;
	 * presumably intentional so transient wrap-arounds show as negative
	 * — confirm before changing the userspace-visible format.
	 */
	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d",
		__entry->cpu,
		__entry->idle == CPU_IDLE ? "idle" :
		(__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"),
		__entry->balance,
		__entry->group_mask, __entry->busiest_nr_running,
		__entry->imbalance, __entry->env_flags, __entry->ld_moved,
		__entry->balance_interval, __entry->active_balance)
);

/*
 * sched_load_balance_nohz_kick - emitted from kick_ilb() just before the
 * reschedule IPI is sent to the chosen idle-load-balance CPU.
 *
 * @cpu:      CPU requesting the nohz kick; its rq state is sampled below
 * @kick_cpu: idle CPU selected to run the nohz idle balance
 *
 * Besides the two arguments, the event samples the kicker's rq directly:
 * nr_running, misfit_task_load, cpu_overutilized(), and the kick target's
 * nohz flags via atomic_read(nohz_flags(kick_cpu)).
 */
TRACE_EVENT(sched_load_balance_nohz_kick,

	TP_PROTO(int cpu, int kick_cpu),

	TP_ARGS(cpu, kick_cpu),

	TP_STRUCT__entry(
		__field(int,		cpu)
		__field(unsigned int,	cpu_nr)
		__field(unsigned long,	misfit_task_load)
		__field(int,		cpu_overutil)
		__field(int,		kick_cpu)
		__field(unsigned long,	nohz_flags)
	),

	TP_fast_assign(
		__entry->cpu	          = cpu;
		__entry->cpu_nr		  = cpu_rq(cpu)->nr_running;
		__entry->misfit_task_load = cpu_rq(cpu)->misfit_task_load;
		__entry->cpu_overutil	  = cpu_overutilized(cpu);
		__entry->kick_cpu	  = kick_cpu;
		__entry->nohz_flags	  = atomic_read(nohz_flags(kick_cpu));
	),

	TP_printk("cpu=%d nr_run=%u misfit_task_load=%lu overutilized=%d kick_cpu=%d nohz_flags=0x%lx",
			__entry->cpu, __entry->cpu_nr,
			__entry->misfit_task_load, __entry->cpu_overutil,
			__entry->kick_cpu, __entry->nohz_flags)

);

/*
 * sched_load_balance_sg_stats - per-sched_group statistics, emitted from
 * update_sd_lb_stats() for each group walked in the domain.
 *
 * @sg_cpus:           first cpumask word of the group being reported
 * @group_type:        classification from enum group_type (other/misfit/
 *                     imbalanced/overloaded)
 * @idle_cpus:         number of idle CPUs in the group
 * @sum_nr_running:    total runnable tasks in the group
 * @group_load:        accumulated load of the group
 * @group_capacity:    total capacity of the group
 * @group_util:        accumulated utilization of the group
 * @group_no_capacity: true if the group cannot absorb more load
 * @load_per_task:     average load per runnable task
 * @misfit_load:       largest misfit task load seen in the group
 * @busiest:           first cpumask word of the busiest group chosen so
 *                     far, or 0 if none yet
 */
TRACE_EVENT(sched_load_balance_sg_stats,

	TP_PROTO(unsigned long sg_cpus, int group_type, unsigned int idle_cpus,
		unsigned int sum_nr_running, unsigned long group_load,
		unsigned long group_capacity, unsigned long group_util,
		int group_no_capacity, unsigned long load_per_task,
		unsigned long misfit_load, unsigned long busiest),

	TP_ARGS(sg_cpus, group_type, idle_cpus, sum_nr_running, group_load,
		group_capacity, group_util, group_no_capacity, load_per_task,
		misfit_load, busiest),

	TP_STRUCT__entry(
		__field(unsigned long,		group_mask)
		__field(int,			group_type)
		__field(unsigned int,		group_idle_cpus)
		__field(unsigned int,		sum_nr_running)
		__field(unsigned long,		group_load)
		__field(unsigned long,		group_capacity)
		__field(unsigned long,		group_util)
		__field(int,			group_no_capacity)
		__field(unsigned long,		load_per_task)
		__field(unsigned long,		misfit_task_load)
		__field(unsigned long,		busiest)
	),

	TP_fast_assign(
		__entry->group_mask			= sg_cpus;
		__entry->group_type			= group_type;
		__entry->group_idle_cpus		= idle_cpus;
		__entry->sum_nr_running			= sum_nr_running;
		__entry->group_load			= group_load;
		__entry->group_capacity			= group_capacity;
		__entry->group_util			= group_util;
		__entry->group_no_capacity		= group_no_capacity;
		__entry->load_per_task			= load_per_task;
		__entry->misfit_task_load		= misfit_load;
		__entry->busiest			= busiest;
	),

	TP_printk("sched_group=%#lx type=%d idle_cpus=%u sum_nr_run=%u group_load=%lu capacity=%lu util=%lu no_capacity=%d lpt=%lu misfit_tload=%lu busiest_group=%#lx",
		__entry->group_mask, __entry->group_type,
		__entry->group_idle_cpus, __entry->sum_nr_running,
		__entry->group_load, __entry->group_capacity,
		__entry->group_util, __entry->group_no_capacity,
		__entry->load_per_task, __entry->misfit_task_load,
		__entry->busiest)
);

/*
 * sched_load_balance_stats - busiest-vs-local group summary, emitted from
 * find_busiest_group() right after calculate_imbalance().
 *
 * @busiest:        first cpumask word of the busiest group
 * @bgroup_type:    enum group_type of the busiest group
 * @bavg_load:      busiest group's average load
 * @bload_per_task: busiest group's load per runnable task
 * @local:          first cpumask word of the local group
 * @lgroup_type:    enum group_type of the local group
 * @lavg_load:      local group's average load
 * @lload_per_task: local group's load per runnable task
 * @sds_avg_load:   whole-domain average load
 * @imbalance:      final env->imbalance chosen for this balance pass
 */
TRACE_EVENT(sched_load_balance_stats,

	TP_PROTO(unsigned long busiest, int bgroup_type,
		unsigned long bavg_load, unsigned long bload_per_task,
		unsigned long local, int lgroup_type, unsigned long lavg_load,
		unsigned long lload_per_task, unsigned long sds_avg_load,
		unsigned long imbalance),

	TP_ARGS(busiest, bgroup_type, bavg_load, bload_per_task, local,
		lgroup_type, lavg_load, lload_per_task, sds_avg_load,
		imbalance),

	TP_STRUCT__entry(
		__field(unsigned long,		busiest)
		__field(int,			bgp_type)
		__field(unsigned long,		bavg_load)
		__field(unsigned long,		blpt)
		__field(unsigned long,		local)
		__field(int,			lgp_type)
		__field(unsigned long,		lavg_load)
		__field(unsigned long,		llpt)
		__field(unsigned long,		sds_avg)
		__field(unsigned long,		imbalance)
	),

	TP_fast_assign(
		__entry->busiest			= busiest;
		__entry->bgp_type			= bgroup_type;
		__entry->bavg_load			= bavg_load;
		__entry->blpt				= bload_per_task;
		/* bgp_type was assigned twice here; duplicate removed */
		__entry->local				= local;
		__entry->lgp_type			= lgroup_type;
		__entry->lavg_load			= lavg_load;
		__entry->llpt				= lload_per_task;
		__entry->sds_avg			= sds_avg_load;
		__entry->imbalance			= imbalance;
	),

	TP_printk("busiest_group=%#lx busiest_type=%d busiest_avg_load=%ld busiest_lpt=%ld local_group=%#lx local_type=%d local_avg_load=%ld local_lpt=%ld domain_avg_load=%ld imbalance=%ld",
		__entry->busiest, __entry->bgp_type, __entry->bavg_load,
		__entry->blpt, __entry->local, __entry->lgp_type,
		__entry->lavg_load, __entry->llpt, __entry->sds_avg,
		__entry->imbalance)
);
#endif

DECLARE_EVENT_CLASS(sched_process_template,

	TP_PROTO(struct task_struct *p),
+253 −23
Original line number Diff line number Diff line
@@ -5203,9 +5203,15 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static unsigned long capacity_of(int cpu);

static inline bool cpu_overutilized(int cpu)
bool __cpu_overutilized(int cpu, int delta)
{
	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
	return (capacity_orig_of(cpu) * 1024) <
		((cpu_util(cpu) + delta) * sched_capacity_margin_up[cpu]);
}

bool cpu_overutilized(int cpu)
{
	return __cpu_overutilized(cpu, 0);
}

static inline void update_overutilized_status(struct rq *rq)
@@ -7817,6 +7823,8 @@ enum group_type {
#define LBF_SOME_PINNED	0x08
#define LBF_NOHZ_STATS	0x10
#define LBF_NOHZ_AGAIN	0x20
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200

struct lb_env {
	struct sched_domain	*sd;
@@ -7992,6 +8000,38 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
	/* Record that we found atleast one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	if (static_branch_unlikely(&sched_energy_present)) {
		struct root_domain *rd = env->dst_rq->rd;

		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized) &&
						env->idle == CPU_NEWLY_IDLE) {
			long util_cum_dst, util_cum_src;
			unsigned long demand;

			demand = task_util(p);
			util_cum_dst = cpu_util_cum(env->dst_cpu, 0) + demand;
			util_cum_src = cpu_util_cum(env->src_cpu, 0) - demand;

			if (util_cum_dst > util_cum_src)
				return 0;
		}
	}

#ifdef CONFIG_SCHED_WALT
	if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
			 !preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
		return 0;

	/* Don't detach task if it doesn't fit on the destination */
	if (env->flags & LBF_IGNORE_BIG_TASKS &&
		!task_fits_max(p, env->dst_cpu))
		return 0;
#endif

	/* Don't detach task if it is under active migration */
	if (env->src_rq->push_task == p)
		return 0;

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
		return 0;
@@ -7999,15 +8039,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)

	/*
	 * Aggressive migration if:
	 * 1) destination numa is preferred
	 * 2) task is cache cold, or
	 * 3) too many balance attempts have failed.
	 * 1) IDLE or NEWLY_IDLE balance.
	 * 2) destination numa is preferred
	 * 3) task is cache cold, or
	 * 4) too many balance attempts have failed.
	 */
	tsk_cache_hot = migrate_degrades_locality(p, env);
	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot(p, env);

	if (tsk_cache_hot <= 0 ||
	if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
		if (tsk_cache_hot == 1) {
			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
@@ -8079,14 +8120,22 @@ static int detach_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	unsigned long load = 0;
	int detached = 0;
	int orig_loop = env->loop;

	lockdep_assert_held(&env->src_rq->lock);

	if (env->imbalance <= 0)
		return 0;

	if (!same_cluster(env->dst_cpu, env->src_cpu))
		env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;

	if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
		env->flags |= LBF_IGNORE_BIG_TASKS;

redo:
	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -8145,9 +8194,23 @@ static int detach_tasks(struct lb_env *env)

		continue;
next:
#ifdef CONFIG_SCHED_WALT
		trace_sched_load_balance_skip_tasks(env->src_cpu, env->dst_cpu,
				env->src_grp_type, p->pid, load, task_util(p),
				cpumask_bits(&p->cpus_allowed)[0]);
#endif
		list_move(&p->se.group_node, tasks);
	}

	if (env->flags & (LBF_IGNORE_BIG_TASKS |
			LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
		tasks = &env->src_rq->cfs_tasks;
		env->flags &= ~(LBF_IGNORE_BIG_TASKS |
				LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
		env->loop = orig_loop;
		goto redo;
	}

	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
@@ -8515,11 +8578,12 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
	int max_cap_cpu;
	unsigned long flags;

	cpu_rq(cpu)->cpu_capacity_orig = capacity;

	capacity *= arch_scale_max_freq_capacity(sd, cpu);
	capacity >>= SCHED_CAPACITY_SHIFT;

	capacity = min(capacity, thermal_cap(cpu));
	cpu_rq(cpu)->cpu_capacity_orig = capacity;

	mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;

	raw_spin_lock_irqsave(&mcc->lock, flags);
@@ -8532,7 +8596,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
		mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
		raw_spin_unlock_irqrestore(&mcc->lock, flags);
		pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
		printk_deferred("CPU%d: update max cpu_capacity %lu\n",
							cpu, capacity);
		goto skip_unlock;
#endif
	}
@@ -9052,6 +9117,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
		sds->total_load += sgs->group_load;
		sds->total_capacity += sgs->group_capacity;

		trace_sched_load_balance_sg_stats(sg->cpumask[0],
				sgs->group_type, sgs->idle_cpus,
				sgs->sum_nr_running, sgs->group_load,
				sgs->group_capacity, sgs->group_util,
				sgs->group_no_capacity,	sgs->load_per_task,
				sgs->group_misfit_task_load,
				sds->busiest ? sds->busiest->cpumask[0] : 0);

		sg = sg->next;
	} while (sg != env->sd->groups);

@@ -9250,6 +9323,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
	    (busiest->avg_load <= sds->avg_load ||
	     local->avg_load >= sds->avg_load)) {
		env->imbalance = 0;
		if (busiest->group_type == group_overloaded &&
				local->group_type <= group_misfit_task) {
			env->imbalance = busiest->load_per_task;
			return;
		}
		return fix_small_imbalance(env, sds);
	}

@@ -9304,9 +9382,24 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
	if (env->imbalance < busiest->load_per_task)
	if (env->imbalance < busiest->load_per_task) {
		/*
		 * The busiest group is overloaded so it could use help
		 * from the other groups. If the local group has idle CPUs
		 * and it is not overloaded and has no imbalance with in
		 * the group, allow the load balance by bumping the
		 * imbalance.
		 */
		if (busiest->group_type == group_overloaded &&
			local->group_type <= group_misfit_task &&
			env->idle != CPU_NOT_IDLE) {
			env->imbalance = busiest->load_per_task;
			return;
		}

		return fix_small_imbalance(env, sds);
	}
}

/******* find_busiest_group() helpers end here *********************/

@@ -9337,9 +9430,35 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
	if (static_branch_unlikely(&sched_energy_present)) {
		struct root_domain *rd = env->dst_rq->rd;

		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) {
			int cpu_local, cpu_busiest;
			long util_cum;
			unsigned long capacity_local, capacity_busiest;

			if (env->idle != CPU_NEWLY_IDLE)
				goto out_balanced;

			if (!sds.local || !sds.busiest)
				goto out_balanced;

			cpu_local = group_first_cpu(sds.local);
			cpu_busiest = group_first_cpu(sds.busiest);

			/* TODO:don't assume same cap cpus are in same domain */
			capacity_local = capacity_orig_of(cpu_local);
			capacity_busiest = capacity_orig_of(cpu_busiest);
			if (capacity_local > capacity_busiest) {
				goto out_balanced;
			} else if (capacity_local == capacity_busiest) {
				if (cpu_rq(cpu_busiest)->nr_running < 2)
					goto out_balanced;

				util_cum = cpu_util_cum(cpu_busiest, 0);
				if (util_cum < cpu_util_cum(cpu_local, 0))
					goto out_balanced;
			}
		}
	}

	local = &sds.local_stat;
	busiest = &sds.busiest_stat;
@@ -9415,6 +9534,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
	/* Looks like there is an imbalance. Compute it */
	env->src_grp_type = busiest->group_type;
	calculate_imbalance(env, &sds);
	trace_sched_load_balance_stats(sds.busiest->cpumask[0],
				busiest->group_type, busiest->avg_load,
				busiest->load_per_task,	sds.local->cpumask[0],
				local->group_type, local->avg_load,
				local->load_per_task,
				sds.avg_load, env->imbalance);
	return env->imbalance ? sds.busiest : NULL;

out_balanced:
@@ -9524,6 +9649,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512
#define NEED_ACTIVE_BALANCE_THRESHOLD 10

static int need_active_balance(struct lb_env *env)
{
@@ -9554,10 +9680,14 @@ static int need_active_balance(struct lb_env *env)
			return 1;
	}

	if (env->src_grp_type == group_misfit_task)
	if (env->idle != CPU_NOT_IDLE &&
			env->src_grp_type == group_misfit_task)
		return 1;

	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
	if ((env->idle != CPU_NOT_IDLE) &&
		(capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
		((capacity_orig_of(env->src_cpu) <
				capacity_orig_of(env->dst_cpu))) &&
				env->src_rq->cfs.h_nr_running == 1 &&
				cpu_overutilized(env->src_cpu) &&
				!cpu_overutilized(env->dst_cpu)) {
@@ -9628,10 +9758,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	int ld_moved = 0, cur_ld_moved, active_balance = 0;
	struct sched_domain *sd_parent = sd->parent;
	struct sched_group *group;
	struct rq *busiest;
	struct sched_group *group = NULL;
	struct rq *busiest = NULL;
	struct rq_flags rf;
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

@@ -9645,6 +9775,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
		.imbalance	= 0,
		.flags		= 0,
		.loop		= 0,
	};

	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
@@ -9689,6 +9822,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,

more_balance:
		rq_lock_irqsave(busiest, &rf);

		/* The world might have changed. Validate assumptions */
		if (busiest->nr_running <= 1) {
			rq_unlock_irqrestore(busiest, &rf);
			env.flags &= ~LBF_ALL_PINNED;
			goto no_move;
		}

		update_rq_clock(busiest);

		/*
@@ -9786,23 +9927,37 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		}
	}

no_move:
	if (!ld_moved) {
		schedstat_inc(sd->lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
		if (idle != CPU_NEWLY_IDLE) {
			if (env.src_grp_nr_running > 1)
				sd->nr_balance_failed++;
		}

		if (need_active_balance(&env)) {
			unsigned long flags;

			raw_spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * The CPUs are marked as reserved if tasks
			 * are pushed/pulled from other CPUs. In that case,
			 * bail out from the load balancer.
			 */
			if (is_reserved(this_cpu) ||
					is_reserved(cpu_of(busiest))) {
				raw_spin_unlock_irqrestore(&busiest->lock,
								flags);
				*continue_balancing = 0;
				goto out;
			}

			/*
			 * Don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest CPU can't be
@@ -9832,10 +9987,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
				*continue_balancing = 0;
			}

			/* We've kicked active balancing, force task migration. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
			sd->nr_balance_failed = sd->cache_nice_tries +
					NEED_ACTIVE_BALANCE_THRESHOLD - 1;
		}
	} else
		sd->nr_balance_failed = 0;
@@ -9887,6 +10044,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,

	ld_moved = 0;
out:
	trace_sched_load_balance(this_cpu, idle, *continue_balancing,
				 group ? group->cpumask[0] : 0,
				 busiest ? busiest->nr_running : 0,
				 env.imbalance, env.flags, ld_moved,
				 sd->balance_interval, active_balance);
	return ld_moved;
}

@@ -9930,9 +10092,22 @@ static int active_load_balance_cpu_stop(void *data)
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct sched_domain *sd = NULL;
	struct task_struct *p = NULL;
	struct rq_flags rf;
	struct task_struct *push_task;
	int push_task_detached = 0;
	struct lb_env env = {
		.sd                     = sd,
		.dst_cpu                = target_cpu,
		.dst_rq                 = target_rq,
		.src_cpu                = busiest_rq->cpu,
		.src_rq                 = busiest_rq,
		.idle                   = CPU_IDLE,
		.flags                  = 0,
		.loop                   = 0,
	};
	bool moved = false;

	rq_lock_irq(busiest_rq, &rf);
	/*
@@ -9959,6 +10134,20 @@ static int active_load_balance_cpu_stop(void *data)
	 */
	BUG_ON(busiest_rq == target_rq);

	push_task = busiest_rq->push_task;
	target_cpu = busiest_rq->push_cpu;
	if (push_task) {
		if (task_on_rq_queued(push_task) &&
			push_task->state == TASK_RUNNING &&
			task_cpu(push_task) == busiest_cpu &&
					cpu_online(target_cpu)) {
			detach_task(push_task, &env);
			push_task_detached = 1;
			moved = true;
		}
		goto out_unlock;
	}

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
@@ -9992,6 +10181,7 @@ static int active_load_balance_cpu_stop(void *data)
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
			moved = true;
		} else {
			schedstat_inc(sd->alb_failed);
		}
@@ -9999,8 +10189,21 @@ static int active_load_balance_cpu_stop(void *data)
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	push_task = busiest_rq->push_task;
	target_cpu = busiest_rq->push_cpu;

	if (push_task)
		busiest_rq->push_task = NULL;

	rq_unlock(busiest_rq, &rf);

	if (push_task) {
		if (push_task_detached)
			attach_one_task(target_rq, push_task);
		put_task_struct(push_task);
		clear_reserved(target_cpu);
	}

	if (p)
		attach_one_task(target_rq, p);

@@ -10148,7 +10351,33 @@ static inline int on_null_domain(struct rq *rq)

static inline int find_new_ilb(void)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);
	int ilb = nr_cpu_ids;
	struct sched_domain *sd;
	int cpu = raw_smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	cpumask_t cpumask;

	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(rq->sd);
	if (sd) {
		cpumask_and(&cpumask, nohz.idle_cpus_mask,
				sched_domain_span(sd));
		cpumask_andnot(&cpumask, &cpumask,
				cpu_isolated_mask);
		ilb = cpumask_first(&cpumask);
	}
	rcu_read_unlock();

	if (sd && (ilb >= nr_cpu_ids || !idle_cpu(ilb))) {
		if (!energy_aware() ||
				(capacity_orig_of(cpu) ==
				cpu_rq(cpu)->rd->max_cpu_capacity.val ||
				cpu_overutilized(cpu))) {
			cpumask_andnot(&cpumask, nohz.idle_cpus_mask,
					cpu_isolated_mask);
			ilb = cpumask_first(&cpumask);
		}
	}

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;
@@ -10182,6 +10411,7 @@ static void kick_ilb(unsigned int flags)
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
	trace_sched_load_balance_nohz_kick(smp_processor_id(), ilb_cpu);
	smp_send_reschedule(ilb_cpu);
}

+3 −0
Original line number Diff line number Diff line
@@ -1755,6 +1755,9 @@ extern void trigger_load_balance(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);

bool __cpu_overutilized(int cpu, int delta);
bool cpu_overutilized(int cpu);

#endif

#ifdef CONFIG_CPU_IDLE