Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 50a92951 authored by Lingutla Chandrasekhar's avatar Lingutla Chandrasekhar Committed by Satya Durga Srinivasu Prabhala
Browse files

sched: Add support to spread tasks



If sysctl_sched_prefer_spread is enabled, then tasks would be freely
migrated to idle CPUs within the same cluster to reduce runnables.

By default, the feature is disabled.
User can trigger feature with:
   echo 1 > /proc/sys/kernel/sched_prefer_spread
	Aggressively spread tasks within the little cluster.
   echo 2 > /proc/sys/kernel/sched_prefer_spread
	Aggressively spread tasks within the little cluster as well as
	the big cluster, but not between big and little.

Change-Id: I0a4d87bd17de3525548765472e6f388a9970f13c
Signed-off-by: default avatarLingutla Chandrasekhar <clingutla@codeaurora.org>
[satyap@codeaurora.org: fix trivial merge conflicts]
Signed-off-by: default avatarSatya Durga Srinivasu Prabhala <satyap@codeaurora.org>
parent 2b0e26d3
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ extern unsigned int __weak sysctl_sched_window_stats_policy;
extern unsigned int __weak sysctl_sched_ravg_window_nr_ticks;
extern unsigned int __weak sysctl_sched_many_wakeup_threshold;
extern unsigned int __weak sysctl_sched_dynamic_ravg_window_enable;
extern unsigned int sysctl_sched_prefer_spread;

extern int
walt_proc_group_thresholds_handler(struct ctl_table *table, int write,
+8 −6
Original line number Diff line number Diff line
@@ -276,11 +276,11 @@ TRACE_EVENT(sched_load_balance,
		unsigned long group_mask, int busiest_nr_running,
		unsigned long imbalance, unsigned int env_flags, int ld_moved,
		unsigned int balance_interval, int active_balance,
		int overutilized),
		int overutilized, int prefer_spread),

	TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running,
		imbalance, env_flags, ld_moved, balance_interval,
		active_balance, overutilized),
		active_balance, overutilized, prefer_spread),

	TP_STRUCT__entry(
		__field(int,                    cpu)
@@ -294,6 +294,7 @@ TRACE_EVENT(sched_load_balance,
		__field(unsigned int,           balance_interval)
		__field(int,                    active_balance)
		__field(int,                    overutilized)
		__field(int,                    prefer_spread)
	),

	TP_fast_assign(
@@ -308,9 +309,10 @@ TRACE_EVENT(sched_load_balance,
		__entry->balance_interval       = balance_interval;
		__entry->active_balance		= active_balance;
		__entry->overutilized		= overutilized;
		__entry->prefer_spread		= prefer_spread;
	),

	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d",
	TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d prefer_spread=%d",
		__entry->cpu,
		__entry->idle == CPU_IDLE ? "idle" :
		(__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"),
@@ -318,7 +320,7 @@ TRACE_EVENT(sched_load_balance,
		__entry->group_mask, __entry->busiest_nr_running,
		__entry->imbalance, __entry->env_flags, __entry->ld_moved,
		__entry->balance_interval, __entry->active_balance,
		__entry->overutilized)
		__entry->overutilized, __entry->prefer_spread)
);

TRACE_EVENT(sched_load_balance_nohz_kick,
@@ -994,7 +996,7 @@ TRACE_EVENT(sched_compute_energy,
TRACE_EVENT(sched_task_util,

	TP_PROTO(struct task_struct *p, unsigned long candidates,
		int best_energy_cpu, bool sync, bool need_idle, int fastpath,
		int best_energy_cpu, bool sync, int need_idle, int fastpath,
		bool placement_boost, u64 start_t,
		bool stune_boosted, bool is_rtg, bool rtg_skip_min,
		int start_cpu),
@@ -1011,7 +1013,7 @@ TRACE_EVENT(sched_task_util,
		__field(int,            prev_cpu)
		__field(int,            best_energy_cpu)
		__field(bool,           sync)
		__field(bool,           need_idle)
		__field(int,            need_idle)
		__field(int,            fastpath)
		__field(int,            placement_boost)
		__field(int,            rtg_cpu)
+90 −21
Original line number Diff line number Diff line
@@ -131,6 +131,9 @@ unsigned int sched_capacity_margin_up[NR_CPUS] = {
unsigned int sched_capacity_margin_down[NR_CPUS] = {
			[0 ... NR_CPUS-1] = 1205}; /* ~15% margin */

#ifdef CONFIG_SCHED_WALT
__read_mostly unsigned int sysctl_sched_prefer_spread;
#endif
unsigned int sched_small_task_threshold = 102;

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -3877,16 +3880,31 @@ static inline bool task_demand_fits(struct task_struct *p, int cpu)
}

struct find_best_target_env {
	bool is_rtg;
	int placement_boost;
	bool need_idle;
	bool boosted;
	int need_idle;
	int fastpath;
	int start_cpu;
	bool strict_max;
	int skip_cpu;
	bool is_rtg;
	bool boosted;
	bool strict_max;
};

/*
 * prefer_spread_on_idle() - should idle balancing on @cpu aggressively
 * spread runnable tasks to idle CPUs?
 *
 * Controlled by the sched_prefer_spread sysctl:
 *   0 - feature disabled (the common case, hence likely())
 *   1 - spread within the little (min-capacity) cluster only
 *   2 - spread within the big cluster as well
 *
 * Always false when CONFIG_SCHED_WALT is not enabled.
 */
static inline bool prefer_spread_on_idle(int cpu)
{
#ifdef CONFIG_SCHED_WALT
	unsigned int level = sysctl_sched_prefer_spread;

	if (likely(level == 0))
		return false;

	/* Level 1 covers little CPUs; big CPUs need level 2. */
	return is_min_capacity_cpu(cpu) ? level >= 1 : level > 1;
#else
	return false;
#endif
}

static inline void adjust_cpus_for_packing(struct task_struct *p,
				int *target_cpu, int *best_idle_cpu,
				int shallowest_idle_cstate,
@@ -3898,7 +3916,10 @@ static inline void adjust_cpus_for_packing(struct task_struct *p,
	if (*best_idle_cpu == -1 || *target_cpu == -1)
		return;

	if (task_placement_boost_enabled(p) || fbt_env->need_idle || boosted ||
	if (prefer_spread_on_idle(*best_idle_cpu))
		fbt_env->need_idle |= 2;

	if (fbt_env->need_idle || task_placement_boost_enabled(p) || boosted ||
		shallowest_idle_cstate <= 0) {
		*target_cpu = -1;
		return;
@@ -7024,6 +7045,7 @@ int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
	curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr);

	fbt_env.fastpath = 0;
	fbt_env.need_idle = need_idle;

	if (trace_sched_task_util_enabled())
		start_t = sched_clock();
@@ -7070,7 +7092,6 @@ int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
	if (sched_feat(FIND_BEST_TARGET)) {
		fbt_env.is_rtg = is_rtg;
		fbt_env.placement_boost = placement_boost;
		fbt_env.need_idle = need_idle;
		fbt_env.start_cpu = start_cpu;
		fbt_env.boosted = boosted;
		fbt_env.strict_max = is_rtg &&
@@ -7096,8 +7117,8 @@ int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
		if (p->state == TASK_WAKING)
			delta = task_util(p);
#endif
		if (task_placement_boost_enabled(p) || need_idle || boosted ||
			is_rtg || __cpu_overutilized(prev_cpu, delta) ||
		if (task_placement_boost_enabled(p) || fbt_env.need_idle ||
		    boosted || is_rtg || __cpu_overutilized(prev_cpu, delta) ||
		    !task_fits_max(p, prev_cpu) || cpu_isolated(prev_cpu)) {
			best_energy_cpu = cpu;
			goto unlock;
@@ -7231,8 +7252,9 @@ int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,

done:
	trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu,
			sync, need_idle, fbt_env.fastpath, placement_boost,
			start_t, boosted, is_rtg, get_rtg_status(p), start_cpu);
			sync, fbt_env.need_idle, fbt_env.fastpath,
			placement_boost, start_t, boosted, is_rtg,
			get_rtg_status(p), start_cpu);

	return best_energy_cpu;

@@ -7946,6 +7968,7 @@ struct lb_env {
	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;
	bool			prefer_spread;

	enum fbq_type		fbq_type;
	enum group_type		src_grp_type;
@@ -8119,7 +8142,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
		struct root_domain *rd = env->dst_rq->rd;

		if ((rcu_dereference(rd->pd) && !sd_overutilized(env->sd)) &&
					env->idle == CPU_NEWLY_IDLE &&
		    env->idle == CPU_NEWLY_IDLE && !env->prefer_spread &&
		    !task_in_related_thread_group(p)) {
			long util_cum_dst, util_cum_src;
			unsigned long demand;
@@ -8289,8 +8312,12 @@ static int detach_tasks(struct lb_env *env)
		 * So only when there is other tasks can be balanced or
		 * there is situation to ignore big task, it is needed
		 * to skip the task load bigger than 2*imbalance.
		 *
		 * And load based checks are skipped for prefer_spread in
		 * finding busiest group, ignore the task's h_load.
		 */
		if (((cpu_rq(env->src_cpu)->nr_running > 2) ||
		if (!env->prefer_spread &&
			((cpu_rq(env->src_cpu)->nr_running > 2) ||
			(env->flags & LBF_IGNORE_BIG_TASKS)) &&
			((load / 2) > env->imbalance))
			goto next;
@@ -9092,6 +9119,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
	if (sgs->group_type < busiest->group_type)
		return false;

	if (env->prefer_spread && env->idle != CPU_NOT_IDLE &&
		(sgs->sum_nr_running > busiest->sum_nr_running) &&
		(sgs->group_util > busiest->group_util))
		return true;

	if (sgs->avg_load <= busiest->avg_load)
		return false;

@@ -9125,6 +9157,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
		return false;

asym_packing:

	if (env->prefer_spread &&
		(sgs->sum_nr_running < busiest->sum_nr_running))
		return false;

	/* This is the busiest node in its class. */
	if (!(env->sd->flags & SD_ASYM_PACKING))
		return true;
@@ -9605,6 +9642,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

		return fix_small_imbalance(env, sds);
	}

	/*
	 * If we couldn't find any imbalance, then boost the imbalance
	 * with the group util.
	 */
	if (env->prefer_spread && !env->imbalance &&
		env->idle != CPU_NOT_IDLE &&
		busiest->sum_nr_running > busiest->group_weight)
		env->imbalance = busiest->group_util;
}

/******* find_busiest_group() helpers end here *********************/
@@ -9998,6 +10044,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		.loop		= 0,
	};

#ifdef CONFIG_SCHED_WALT
	env.prefer_spread = (prefer_spread_on_idle(this_cpu) &&
				!((sd->flags & SD_ASYM_CPUCAPACITY) &&
				 !cpumask_test_cpu(this_cpu,
						 &asym_cap_sibling_cpus)));
#else
	env.prefer_spread = false;
#endif

	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

	schedstat_inc(sd->lb_count[idle]);
@@ -10287,10 +10342,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
				 env.imbalance, env.flags, ld_moved,
				 sd->balance_interval, active_balance,
#ifdef CONFIG_SCHED_WALT
				 sd_overutilized(sd));
				 sd_overutilized(sd),
#else
				 READ_ONCE(this_rq->rd->overutilized));
				 READ_ONCE(this_rq->rd->overutilized),
#endif
				 env.prefer_spread);
	return ld_moved;
}

@@ -10535,7 +10591,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
		max_cost += sd->max_newidle_lb_cost;

#ifdef CONFIG_SCHED_WALT
		if (!sd_overutilized(sd))
		if (!sd_overutilized(sd) && !prefer_spread_on_idle(cpu))
			continue;
#endif

@@ -10782,7 +10838,8 @@ static void nohz_balancer_kick(struct rq *rq)
	 * happens from the tickpath.
	 */
	if (sched_energy_enabled()) {
		if (rq->nr_running >= 2 && cpu_overutilized(cpu))
		if (rq->nr_running >= 2 && (cpu_overutilized(cpu) ||
					prefer_spread_on_idle(cpu)))
			flags = NOHZ_KICK_MASK;
		goto out;
	}
@@ -11187,6 +11244,11 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
	int pulled_task = 0;
	u64 curr_cost = 0;
	u64 avg_idle = this_rq->avg_idle;
	bool prefer_spread = prefer_spread_on_idle(this_cpu);
	bool force_lb = (!is_min_capacity_cpu(this_cpu) &&
				silver_has_big_tasks() &&
				(atomic_read(&this_rq->nr_iowait) == 0));


	if (cpu_isolated(this_cpu))
		return 0;
@@ -11203,8 +11265,8 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
	 */
	if (!cpu_active(this_cpu))
		return 0;
	if (!is_min_capacity_cpu(this_cpu) && silver_has_big_tasks()
		&& (atomic_read(&this_rq->nr_iowait) == 0))

	if (force_lb || prefer_spread)
		avg_idle = ULLONG_MAX;
	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
@@ -11239,6 +11301,13 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

#ifdef CONFIG_SCHED_WALT
		if (prefer_spread && !force_lb &&
			(sd->flags & SD_ASYM_CPUCAPACITY) &&
			!(cpumask_test_cpu(this_cpu, &asym_cap_sibling_cpus)))
			avg_idle = this_rq->avg_idle;
#endif

		if (avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			update_next_balance(sd, &next_balance);
			break;
+9 −0
Original line number Diff line number Diff line
@@ -583,6 +583,15 @@ static struct ctl_table kern_table[] = {
		.mode		= 0644,
		.proc_handler	= sched_updown_migrate_handler,
	},
	{
		.procname	= "sched_prefer_spread",
		.data		= &sysctl_sched_prefer_spread,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler   = proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= &two,
	},
#endif
#ifdef CONFIG_SCHED_DEBUG
	{