Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1c7ab10e authored by Vikram Mulukutla's avatar Vikram Mulukutla Committed by Gerrit - the friendly Code Review server
Browse files

sched: fair: Add load-balancing changes snapshot



This snapshot is taken from msm-4.9 as of commit 935c3e96d14c14d
(Revert "sched/fair: Limit sync wakeup bias to waker cpu").

Change-Id: I678c66d6fa4ac96c74892328d181aa334a5e2fa7
Signed-off-by: default avatarVikram Mulukutla <markivx@codeaurora.org>
[satyap@codeaurora.org:
    1. Resolve merge conflicts
    2. Fix indentation issues
    3. Fix compilation issues for ARCH=um]
Signed-off-by: default avatarSatya Durga Srinivasu Prabhala <satyap@codeaurora.org>
parent f7fac0f3
Loading
Loading
Loading
Loading
+51 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
#include <linux/sched/idle.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread:
@@ -198,6 +199,56 @@ TRACE_EVENT(sched_migrate_task,
		  __entry->orig_cpu, __entry->dest_cpu)
);

/*
 * Tracepoint for load balancing:
 *
 * The busiest group's CPU mask is carried in an unsigned long
 * (group_mask), which is only guaranteed to hold 32 bits, hence the
 * hard NR_CPUS limit below.
 */
#if NR_CPUS > 32
#error "Unsupported NR_CPUS for lb tracepoint."
#endif
TRACE_EVENT(sched_load_balance,

	TP_PROTO(int cpu, enum cpu_idle_type idle, int balance,
		unsigned long group_mask, int busiest_nr_running,
		unsigned long imbalance, unsigned int env_flags, int ld_moved,
		unsigned int balance_interval),

	TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running,
		imbalance, env_flags, ld_moved, balance_interval),

	TP_STRUCT__entry(
		__field(        int,                    cpu)
		__field(        enum cpu_idle_type,     idle)
		__field(        int,                    balance)
		__field(        unsigned long,          group_mask)
		__field(        int,                    busiest_nr_running)
		__field(        unsigned long,          imbalance)
		__field(        unsigned int,           env_flags)
		__field(        int,                    ld_moved)
		__field(        unsigned int,           balance_interval)
	),

	TP_fast_assign(
		__entry->cpu                    = cpu;
		__entry->idle                   = idle;
		__entry->balance                = balance;
		__entry->group_mask             = group_mask;
		__entry->busiest_nr_running     = busiest_nr_running;
		__entry->imbalance              = imbalance;
		__entry->env_flags              = env_flags;
		__entry->ld_moved               = ld_moved;
		__entry->balance_interval       = balance_interval;
	),

	/*
	 * Format specifiers must match the field types above:
	 * imbalance is unsigned long (%lu, was %ld) and
	 * balance_interval is unsigned int (%u, was %d).
	 */
	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%lu flags=%#x ld_moved=%d bal_int=%u",
		__entry->cpu,
		__entry->idle == CPU_IDLE ? "idle" :
		(__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"),
		__entry->balance,
		__entry->group_mask, __entry->busiest_nr_running,
		__entry->imbalance, __entry->env_flags, __entry->ld_moved,
		__entry->balance_interval)
);

DECLARE_EVENT_CLASS(sched_process_template,

	TP_PROTO(struct task_struct *p),
+181 −17
Original line number Diff line number Diff line
@@ -7228,9 +7228,15 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
	return task_fits_capacity(p, min_cap, cpu);
}

static bool cpu_overutilized(int cpu)
bool __cpu_overutilized(int cpu, unsigned long util)
{
	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
	return (capacity_orig_of(cpu) * 1024 <
		util * capacity_margin);
}

/*
 * cpu_overutilized() - report whether @cpu is over-utilized right now,
 * judged against its current cpu_util() estimate.
 */
bool cpu_overutilized(int cpu)
{
	unsigned long cur_util = cpu_util(cpu);

	return __cpu_overutilized(cpu, cur_util);
}

DEFINE_PER_CPU(struct energy_env, eenv_cache);
@@ -8196,6 +8202,8 @@ enum group_type {
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED  0x04
#define LBF_SOME_PINNED	0x08
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200

struct lb_env {
	struct sched_domain	*sd;
@@ -8367,9 +8375,33 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
		return 0;
	}

	if (energy_aware() && !sd_overutilized(env->sd) &&
	    env->idle == CPU_NEWLY_IDLE) {
		long util_cum_dst, util_cum_src;
		unsigned long demand;

		demand = task_util(p);
		util_cum_dst = cpu_util_cum(env->dst_cpu, 0) + demand;
		util_cum_src = cpu_util_cum(env->src_cpu, 0) - demand;

		if (util_cum_dst > util_cum_src)
			return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

#ifdef CONFIG_SCHED_WALT
	if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
			 !preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
		return 0;

	/* Don't detach task if it doesn't fit on the destination */
	if (env->flags & LBF_IGNORE_BIG_TASKS) //&&
		//!task_fits_max(p, env->dst_cpu))
		return 0;
#endif

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
		return 0;
@@ -8377,15 +8409,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)

	/*
	 * Aggressive migration if:
	 * 1) destination numa is preferred
	 * 2) task is cache cold, or
	 * 3) too many balance attempts have failed.
	 * 1) IDLE or NEWLY_IDLE balance.
	 * 2) destination numa is preferred
	 * 3) task is cache cold, or
	 * 4) too many balance attempts have failed.
	 */
	tsk_cache_hot = migrate_degrades_locality(p, env);
	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot(p, env);

	if (tsk_cache_hot <= 0 ||
	if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
		if (tsk_cache_hot == 1) {
			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
@@ -8456,12 +8489,20 @@ static int detach_tasks(struct lb_env *env)
	struct task_struct *p;
	unsigned long load;
	int detached = 0;
	int orig_loop = env->loop;

	lockdep_assert_held(&env->src_rq->lock);

	if (env->imbalance <= 0)
		return 0;

	if (!same_cluster(env->dst_cpu, env->src_cpu))
		env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;

	if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
		env->flags |= LBF_IGNORE_BIG_TASKS;

redo:
	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -8523,6 +8564,15 @@ static int detach_tasks(struct lb_env *env)
		list_move_tail(&p->se.group_node, tasks);
	}

	if (env->flags & (LBF_IGNORE_BIG_TASKS |
			LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
		tasks = &env->src_rq->cfs_tasks;
		env->flags &= ~(LBF_IGNORE_BIG_TASKS |
				LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
		env->loop = orig_loop;
		goto redo;
	}

	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
@@ -8837,6 +8887,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
	struct sched_group *sdg = sd->groups;

	capacity = min(capacity, thermal_cap(cpu));

	cpu_rq(cpu)->cpu_capacity_orig = capacity;

	capacity *= scale_rt_capacity(cpu);
@@ -9648,8 +9700,34 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
	 */
	update_sd_lb_stats(env, &sds);

	if (energy_aware() && !sd_overutilized(env->sd))
	if (energy_aware() && !sd_overutilized(env->sd)) {
		int cpu_local, cpu_busiest;
		long util_cum;
		unsigned long capacity_local, capacity_busiest;

		if (env->idle != CPU_NEWLY_IDLE)
			goto out_balanced;

		if (!sds.local || !sds.busiest)
			goto out_balanced;

		cpu_local = group_first_cpu(sds.local);
		cpu_busiest = group_first_cpu(sds.busiest);

		/* TODO: don't assume same cap cpus are in same domain */
		capacity_local = capacity_orig_of(cpu_local);
		capacity_busiest = capacity_orig_of(cpu_busiest);
		if (capacity_local > capacity_busiest) {
			goto out_balanced;
		} else if (capacity_local == capacity_busiest) {
			if (cpu_rq(cpu_busiest)->nr_running < 2)
				goto out_balanced;

			util_cum = cpu_util_cum(cpu_busiest, 0);
			if (util_cum < cpu_util_cum(cpu_local, 0))
				goto out_balanced;
		}
	}

	local = &sds.local_stat;
	busiest = &sds.busiest_stat;
@@ -9817,6 +9895,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512
#define NEED_ACTIVE_BALANCE_THRESHOLD 10

static int need_active_balance(struct lb_env *env)
{
@@ -9855,7 +9934,8 @@ static int need_active_balance(struct lb_env *env)
			return 1;
	}

	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
	return unlikely(sd->nr_balance_failed >
			sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}

static int group_balance_cpu_not_isolated(struct sched_group *sg)
@@ -9915,10 +9995,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	int ld_moved = 0, cur_ld_moved, active_balance = 0;
	struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
	struct sched_group *group;
	struct rq *busiest;
	struct sched_group *group = NULL;
	struct rq *busiest = NULL;
	struct rq_flags rf;
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

@@ -9932,6 +10012,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
		.imbalance		= 0,
		.flags			= 0,
		.loop			= 0,
	};

	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
@@ -9978,6 +10061,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		rq_lock_irqsave(busiest, &rf);
		update_rq_clock(busiest);

		/* The world might have changed. Validate assumptions */
		if (busiest->nr_running <= 1) {
			rq_unlock_irqrestore(busiest, &rf);
			env.flags &= ~LBF_ALL_PINNED;
			goto no_move;
		}
		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved     - cumulative load moved across iterations
@@ -10073,17 +10162,18 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		}
	}

no_move:
	if (!ld_moved) {
		schedstat_inc(sd->lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
		if (idle != CPU_NEWLY_IDLE) {
			if (env.src_grp_nr_running > 1)
				sd->nr_balance_failed++;
		}

		if (need_active_balance(&env)) {
			unsigned long flags;
@@ -10118,10 +10208,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
				*continue_balancing = 0;
			}

			/* We've kicked active balancing, force task migration. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
			sd->nr_balance_failed = sd->cache_nice_tries +
					NEED_ACTIVE_BALANCE_THRESHOLD - 1;
		}
	} else
		sd->nr_balance_failed = 0;
@@ -10173,6 +10265,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,

	ld_moved = 0;
out:
	trace_sched_load_balance(this_cpu, idle, *continue_balancing,
				 group ? group->cpumask[0] : 0,
				 busiest ? busiest->nr_running : 0,
				 env.imbalance, env.flags, ld_moved,
				 sd->balance_interval);
	return ld_moved;
}

@@ -10352,9 +10449,22 @@ static int active_load_balance_cpu_stop(void *data)
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct sched_domain *sd = NULL;
	struct task_struct *p = NULL;
	struct rq_flags rf;
	struct task_struct *push_task;
	int push_task_detached = 0;
	struct lb_env env = {
		.sd			= sd,
		.dst_cpu		= target_cpu,
		.dst_rq			= target_rq,
		.src_cpu		= busiest_rq->cpu,
		.src_rq			= busiest_rq,
		.idle			= CPU_IDLE,
		.flags			= 0,
		.loop			= 0,
	};
	bool moved = false;

	rq_lock_irq(busiest_rq, &rf);
	/*
@@ -10381,6 +10491,20 @@ static int active_load_balance_cpu_stop(void *data)
	 */
	BUG_ON(busiest_rq == target_rq);

	push_task = busiest_rq->push_task;
	target_cpu = busiest_rq->push_cpu;
	if (push_task) {
		if (task_on_rq_queued(push_task) &&
			push_task->state == TASK_RUNNING &&
			task_cpu(push_task) == busiest_cpu &&
					cpu_online(target_cpu)) {
			detach_task(push_task, &env);
			push_task_detached = 1;
			moved = true;
		}
		goto out_unlock;
	}

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
@@ -10414,6 +10538,7 @@ static int active_load_balance_cpu_stop(void *data)
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
			moved = true;
		} else {
			schedstat_inc(sd->alb_failed);
		}
@@ -10421,8 +10546,21 @@ static int active_load_balance_cpu_stop(void *data)
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	push_task = busiest_rq->push_task;
	target_cpu = busiest_rq->push_cpu;

	if (push_task)
		busiest_rq->push_task = NULL;

	rq_unlock(busiest_rq, &rf);

	if (push_task) {
		if (push_task_detached)
			attach_one_task(target_rq, push_task);
		put_task_struct(push_task);
		clear_reserved(target_cpu);
	}

	if (p)
		attach_one_task(target_rq, p);

@@ -10446,7 +10584,33 @@ static inline int on_null_domain(struct rq *rq)

static inline int find_new_ilb(void)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);
	int ilb = nr_cpu_ids;
	struct sched_domain *sd;
	int cpu = raw_smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	cpumask_t cpumask;

	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(rq->sd);
	if (sd) {
		cpumask_and(&cpumask, nohz.idle_cpus_mask,
				sched_domain_span(sd));
		cpumask_andnot(&cpumask, &cpumask,
				cpu_isolated_mask);
		ilb = cpumask_first(&cpumask);
	}
	rcu_read_unlock();

	if (sd && (ilb >= nr_cpu_ids || !idle_cpu(ilb))) {
		if (!energy_aware() ||
				(capacity_orig_of(cpu) ==
				cpu_rq(cpu)->rd->max_cpu_capacity ||
				cpu_overutilized(cpu))) {
			cpumask_andnot(&cpumask, nohz.idle_cpus_mask,
					cpu_isolated_mask);
			ilb = cpumask_first(&cpumask);
		}
	}

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;