Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1c7ab10e authored by Vikram Mulukutla's avatar Vikram Mulukutla Committed by Gerrit - the friendly Code Review server
Browse files

sched: fair: Add load-balancing changes snapshot



This snapshot is taken from msm-4.9 as of commit 935c3e96d14c14d
(Revert "sched/fair: Limit sync wakeup bias to waker cpu").

Change-Id: I678c66d6fa4ac96c74892328d181aa334a5e2fa7
Signed-off-by: default avatarVikram Mulukutla <markivx@codeaurora.org>
[satyap@codeaurora.org:
    1. Resolve merge conflicts
    2. Fix indentation issues
    3. Fix compilation issues for ARCH=um]
Signed-off-by: default avatarSatya Durga Srinivasu Prabhala <satyap@codeaurora.org>
parent f7fac0f3
Loading
Loading
Loading
Loading
+51 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
#include <linux/sched/idle.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread:
@@ -198,6 +199,56 @@ TRACE_EVENT(sched_migrate_task,
		  __entry->orig_cpu, __entry->dest_cpu)
);

/*
 * Tracepoint for load balancing:
 *
 * The busiest group's CPU mask is carried in an unsigned long
 * (group_mask), which is only guaranteed to hold 32 bits, hence the
 * hard NR_CPUS limit below.
 */
#if NR_CPUS > 32
#error "Unsupported NR_CPUS for lb tracepoint."
#endif
TRACE_EVENT(sched_load_balance,

	TP_PROTO(int cpu, enum cpu_idle_type idle, int balance,
		unsigned long group_mask, int busiest_nr_running,
		unsigned long imbalance, unsigned int env_flags, int ld_moved,
		unsigned int balance_interval),

	TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running,
		imbalance, env_flags, ld_moved, balance_interval),

	TP_STRUCT__entry(
		__field(        int,                    cpu)
		__field(        enum cpu_idle_type,     idle)
		__field(        int,                    balance)
		__field(        unsigned long,          group_mask)
		__field(        int,                    busiest_nr_running)
		__field(        unsigned long,          imbalance)
		__field(        unsigned int,           env_flags)
		__field(        int,                    ld_moved)
		__field(        unsigned int,           balance_interval)
	),

	TP_fast_assign(
		__entry->cpu                    = cpu;
		__entry->idle                   = idle;
		__entry->balance                = balance;
		__entry->group_mask             = group_mask;
		__entry->busiest_nr_running     = busiest_nr_running;
		__entry->imbalance              = imbalance;
		__entry->env_flags              = env_flags;
		__entry->ld_moved               = ld_moved;
		__entry->balance_interval       = balance_interval;
	),

	/*
	 * Format specifiers must match the field types above:
	 * imbalance is unsigned long (%lu, was %ld) and
	 * balance_interval is unsigned int (%u, was %d).
	 */
	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%lu flags=%#x ld_moved=%d bal_int=%u",
		__entry->cpu,
		__entry->idle == CPU_IDLE ? "idle" :
		(__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"),
		__entry->balance,
		__entry->group_mask, __entry->busiest_nr_running,
		__entry->imbalance, __entry->env_flags, __entry->ld_moved,
		__entry->balance_interval)
);

DECLARE_EVENT_CLASS(sched_process_template,

	TP_PROTO(struct task_struct *p),
+181 −17
Original line number Diff line number Diff line
@@ -7228,9 +7228,15 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
	return task_fits_capacity(p, min_cap, cpu);
}

static bool cpu_overutilized(int cpu)
bool __cpu_overutilized(int cpu, unsigned long util)
{
	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
	return (capacity_orig_of(cpu) * 1024 <
		util * capacity_margin);
}

/*
 * cpu_overutilized() - report whether @cpu is over-utilized right now,
 * judged against its current cpu_util() estimate.
 */
bool cpu_overutilized(int cpu)
{
	unsigned long cur_util = cpu_util(cpu);

	return __cpu_overutilized(cpu, cur_util);
}

DEFINE_PER_CPU(struct energy_env, eenv_cache);
@@ -8196,6 +8202,8 @@ enum group_type {
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED  0x04
#define LBF_SOME_PINNED	0x08
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200

struct lb_env {
	struct sched_domain	*sd;
@@ -8367,9 +8375,33 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
		return 0;
	}

	if (energy_aware() && !sd_overutilized(env->sd) &&
	    env->idle == CPU_NEWLY_IDLE) {
		long util_cum_dst, util_cum_src;
		unsigned long demand;

		demand = task_util(p);
		util_cum_dst = cpu_util_cum(env->dst_cpu, 0) + demand;
		util_cum_src = cpu_util_cum(env->src_cpu, 0) - demand;

		if (util_cum_dst > util_cum_src)
			return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

#ifdef CONFIG_SCHED_WALT
	if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
			 !preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
		return 0;

	/* Don't detach task if it doesn't fit on the destination */
	if (env->flags & LBF_IGNORE_BIG_TASKS) //&&
		//!task_fits_max(p, env->dst_cpu))
		return 0;
#endif

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
		return 0;
@@ -8377,15 +8409,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)

	/*
	 * Aggressive migration if:
	 * 1) destination numa is preferred
	 * 2) task is cache cold, or
	 * 3) too many balance attempts have failed.
	 * 1) IDLE or NEWLY_IDLE balance.
	 * 2) destination numa is preferred
	 * 3) task is cache cold, or
	 * 4) too many balance attempts have failed.
	 */
	tsk_cache_hot = migrate_degrades_locality(p, env);
	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot(p, env);

	if (tsk_cache_hot <= 0 ||
	if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
		if (tsk_cache_hot == 1) {
			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
@@ -8456,12 +8489,20 @@ static int detach_tasks(struct lb_env *env)
	struct task_struct *p;
	unsigned long load;
	int detached = 0;
	int orig_loop = env->loop;

	lockdep_assert_held(&env->src_rq->lock);

	if (env->imbalance <= 0)
		return 0;

	if (!same_cluster(env->dst_cpu, env->src_cpu))
		env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;

	if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
		env->flags |= LBF_IGNORE_BIG_TASKS;

redo:
	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -8523,6 +8564,15 @@ static int detach_tasks(struct lb_env *env)
		list_move_tail(&p->se.group_node, tasks);
	}

	if (env->flags & (LBF_IGNORE_BIG_TASKS |
			LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
		tasks = &env->src_rq->cfs_tasks;
		env->flags &= ~(LBF_IGNORE_BIG_TASKS |
				LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
		env->loop = orig_loop;
		goto redo;
	}

	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
@@ -8837,6 +8887,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
	struct sched_group *sdg = sd->groups;

	capacity = min(capacity, thermal_cap(cpu));

	cpu_rq(cpu)->cpu_capacity_orig = capacity;

	capacity *= scale_rt_capacity(cpu);
@@ -9648,8 +9700,34 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
	 */
	update_sd_lb_stats(env, &sds);

	if (energy_aware() && !sd_overutilized(env->sd))
	if (energy_aware() && !sd_overutilized(env->sd)) {
		int cpu_local, cpu_busiest;
		long util_cum;
		unsigned long capacity_local, capacity_busiest;

		if (env->idle != CPU_NEWLY_IDLE)
			goto out_balanced;

		if (!sds.local || !sds.busiest)
			goto out_balanced;

		cpu_local = group_first_cpu(sds.local);
		cpu_busiest = group_first_cpu(sds.busiest);

		/* TODO: don't assume same cap cpus are in same domain */
		capacity_local = capacity_orig_of(cpu_local);
		capacity_busiest = capacity_orig_of(cpu_busiest);
		if (capacity_local > capacity_busiest) {
			goto out_balanced;
		} else if (capacity_local == capacity_busiest) {
			if (cpu_rq(cpu_busiest)->nr_running < 2)
				goto out_balanced;

			util_cum = cpu_util_cum(cpu_busiest, 0);
			if (util_cum < cpu_util_cum(cpu_local, 0))
				goto out_balanced;
		}
	}

	local = &sds.local_stat;
	busiest = &sds.busiest_stat;
@@ -9817,6 +9895,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512
#define NEED_ACTIVE_BALANCE_THRESHOLD 10

static int need_active_balance(struct lb_env *env)
{
@@ -9855,7 +9934,8 @@ static int need_active_balance(struct lb_env *env)
			return 1;
	}

	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
	return unlikely(sd->nr_balance_failed >
			sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}

static int group_balance_cpu_not_isolated(struct sched_group *sg)
@@ -9915,10 +9995,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	int ld_moved = 0, cur_ld_moved, active_balance = 0;
	struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
	struct sched_group *group;
	struct rq *busiest;
	struct sched_group *group = NULL;
	struct rq *busiest = NULL;
	struct rq_flags rf;
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

@@ -9932,6 +10012,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
		.imbalance		= 0,
		.flags			= 0,
		.loop			= 0,
	};

	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
@@ -9978,6 +10061,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		rq_lock_irqsave(busiest, &rf);
		update_rq_clock(busiest);

		/* The world might have changed. Validate assumptions */
		if (busiest->nr_running <= 1) {
			rq_unlock_irqrestore(busiest, &rf);
			env.flags &= ~LBF_ALL_PINNED;
			goto no_move;
		}
		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved     - cumulative load moved across iterations
@@ -10073,17 +10162,18 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		}
	}

no_move:
	if (!ld_moved) {
		schedstat_inc(sd->lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
		if (idle != CPU_NEWLY_IDLE) {
			if (env.src_grp_nr_running > 1)
				sd->nr_balance_failed++;
		}

		if (need_active_balance(&env)) {
			unsigned long flags;
@@ -10118,10 +10208,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
				*continue_balancing = 0;
			}

			/* We've kicked active balancing, force task migration. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
			sd->nr_balance_failed = sd->cache_nice_tries +
					NEED_ACTIVE_BALANCE_THRESHOLD - 1;
		}
	} else
		sd->nr_balance_failed = 0;
@@ -10173,6 +10265,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,

	ld_moved = 0;
out:
	trace_sched_load_balance(this_cpu, idle, *continue_balancing,
				 group ? group->cpumask[0] : 0,
				 busiest ? busiest->nr_running : 0,
				 env.imbalance, env.flags, ld_moved,
				 sd->balance_interval);
	return ld_moved;
}

@@ -10352,9 +10449,22 @@ static int active_load_balance_cpu_stop(void *data)
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct sched_domain *sd = NULL;
	struct task_struct *p = NULL;
	struct rq_flags rf;
	struct task_struct *push_task;
	int push_task_detached = 0;
	struct lb_env env = {
		.sd			= sd,
		.dst_cpu		= target_cpu,
		.dst_rq			= target_rq,
		.src_cpu		= busiest_rq->cpu,
		.src_rq			= busiest_rq,
		.idle			= CPU_IDLE,
		.flags			= 0,
		.loop			= 0,
	};
	bool moved = false;

	rq_lock_irq(busiest_rq, &rf);
	/*
@@ -10381,6 +10491,20 @@ static int active_load_balance_cpu_stop(void *data)
	 */
	BUG_ON(busiest_rq == target_rq);

	push_task = busiest_rq->push_task;
	target_cpu = busiest_rq->push_cpu;
	if (push_task) {
		if (task_on_rq_queued(push_task) &&
			push_task->state == TASK_RUNNING &&
			task_cpu(push_task) == busiest_cpu &&
					cpu_online(target_cpu)) {
			detach_task(push_task, &env);
			push_task_detached = 1;
			moved = true;
		}
		goto out_unlock;
	}

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
@@ -10414,6 +10538,7 @@ static int active_load_balance_cpu_stop(void *data)
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
			moved = true;
		} else {
			schedstat_inc(sd->alb_failed);
		}
@@ -10421,8 +10546,21 @@ static int active_load_balance_cpu_stop(void *data)
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	push_task = busiest_rq->push_task;
	target_cpu = busiest_rq->push_cpu;

	if (push_task)
		busiest_rq->push_task = NULL;

	rq_unlock(busiest_rq, &rf);

	if (push_task) {
		if (push_task_detached)
			attach_one_task(target_rq, push_task);
		put_task_struct(push_task);
		clear_reserved(target_cpu);
	}

	if (p)
		attach_one_task(target_rq, p);

@@ -10446,7 +10584,33 @@ static inline int on_null_domain(struct rq *rq)

static inline int find_new_ilb(void)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);
	int ilb = nr_cpu_ids;
	struct sched_domain *sd;
	int cpu = raw_smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	cpumask_t cpumask;

	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(rq->sd);
	if (sd) {
		cpumask_and(&cpumask, nohz.idle_cpus_mask,
				sched_domain_span(sd));
		cpumask_andnot(&cpumask, &cpumask,
				cpu_isolated_mask);
		ilb = cpumask_first(&cpumask);
	}
	rcu_read_unlock();

	if (sd && (ilb >= nr_cpu_ids || !idle_cpu(ilb))) {
		if (!energy_aware() ||
				(capacity_orig_of(cpu) ==
				cpu_rq(cpu)->rd->max_cpu_capacity ||
				cpu_overutilized(cpu))) {
			cpumask_andnot(&cpumask, nohz.idle_cpus_mask,
					cpu_isolated_mask);
			ilb = cpumask_first(&cpumask);
		}
	}

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;