
Commit 30ce5dab authored by Peter Zijlstra, committed by Ingo Molnar

sched/fair: Rework and comment the group_imb code



Rik reported some weirdness due to the group_imb code. As a start to
looking at it, clean it up a little and add a few explanatory
comments.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org


Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 6906a408
+89 −34
@@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
	return 0;
}

/*
 * Group imbalance indicates (and tries to solve) the problem where balancing
 * groups is inadequate due to tsk_cpus_allowed() constraints.
 *
 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
 * Something like:
 *
 * 	{ 0 1 2 3 } { 4 5 6 7 }
 * 	        *     * * *
 *
 * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
 * cpu 3 and leave one of the cpus in the second group unused.
 *
 * The current solution to this issue is detecting the skew in the first group
 * by noticing it has a cpu that is overloaded while the remaining cpus are
 * idle -- or rather, there's a distinct imbalance in the cpus; see
 * sg_imbalanced().
 *
 * When this is detected, this group becomes a candidate for busiest; see
 * update_sd_pick_busiest(). And calculate_imbalance() and
 * find_busiest_group() avoid some of the usual balance conditions to allow it
 * to create an effective group imbalance.
 *
 * This is a somewhat tricky proposition since the next run might not find the
 * group imbalance and decide the groups need to be balanced again. A most
 * subtle and fragile situation.
 */

struct sg_imb_stats {
	unsigned long max_nr_running, min_nr_running;
	unsigned long max_cpu_load, min_cpu_load;
};

static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
{
	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
}

static inline void
update_sg_imb_stats(struct sg_imb_stats *sgi,
		    unsigned long load, unsigned long nr_running)
{
	if (load > sgi->max_cpu_load)
		sgi->max_cpu_load = load;
	if (sgi->min_cpu_load > load)
		sgi->min_cpu_load = load;

	if (nr_running > sgi->max_nr_running)
		sgi->max_nr_running = nr_running;
	if (sgi->min_nr_running > nr_running)
		sgi->min_nr_running = nr_running;
}

static inline int
sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
{
	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of a task.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 *      might not be a suitable number - should we keep a
	 *      normalized nr_running number somewhere that negates
	 *      the hierarchy?
	 */
	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
		return 1;

	return 0;
}

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
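As a reading aid for the helpers introduced in this hunk, here is a minimal user-space sketch (not the kernel code): it feeds assumed per-cpu load and nr_running figures for the { 0 1 2 3 } group of the example above into copies of the new helpers and evaluates the same condition sg_imbalanced() tests; load_per_task = 1024 is likewise an assumed average task weight.

/*
 * Standalone sketch of the detection logic: cpu 3 carries all three pinned
 * tasks of the first group while cpus 0-2 sit idle. All numbers are
 * illustrative, chosen to match the { 0 1 2 3 } { 4 5 6 7 } example.
 */
#include <stdio.h>

struct sg_imb_stats {
	unsigned long max_nr_running, min_nr_running;
	unsigned long max_cpu_load, min_cpu_load;
};

static void init_sg_imb_stats(struct sg_imb_stats *sgi)
{
	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
}

static void update_sg_imb_stats(struct sg_imb_stats *sgi,
				unsigned long load, unsigned long nr_running)
{
	if (load > sgi->max_cpu_load)
		sgi->max_cpu_load = load;
	if (sgi->min_cpu_load > load)
		sgi->min_cpu_load = load;

	if (nr_running > sgi->max_nr_running)
		sgi->max_nr_running = nr_running;
	if (sgi->min_nr_running > nr_running)
		sgi->min_nr_running = nr_running;
}

int main(void)
{
	/* Assumed per-cpu (load, nr_running) for cpus 0-3 of the first group. */
	unsigned long load[4]       = { 0, 0, 0, 3072 };
	unsigned long nr_running[4] = { 0, 0, 0, 3 };
	unsigned long load_per_task = 1024;	/* assumed average task weight */
	struct sg_imb_stats sgi;
	int i, imb;

	init_sg_imb_stats(&sgi);
	for (i = 0; i < 4; i++)
		update_sg_imb_stats(&sgi, load[i], nr_running[i]);

	/* Same condition as sg_imbalanced(): a load spread of at least one
	 * average task and more than one task between busiest and idlest cpu. */
	imb = (sgi.max_cpu_load - sgi.min_cpu_load) >= load_per_task &&
	      (sgi.max_nr_running - sgi.min_nr_running) > 1;

	printf("group_imb = %d\n", imb);	/* prints 1 for this spread */
	return 0;
}

With these assumed numbers the spread between cpu 3 and the idle cpus exceeds one average task weight, so the group would be flagged and become a candidate for busiest in update_sd_pick_busiest().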
@@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
			int local_group, struct sg_lb_stats *sgs)
{
	unsigned long nr_running, max_nr_running, min_nr_running;
	unsigned long load, max_cpu_load, min_cpu_load;
	struct sg_imb_stats sgi;
	unsigned long nr_running;
	unsigned long load;
	int i;

	/* Tally up the load of all CPUs in the group */
	max_cpu_load = 0;
	min_cpu_load = ~0UL;
	max_nr_running = 0;
	min_nr_running = ~0UL;
	init_sg_imb_stats(&sgi);

	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
		struct rq *rq = cpu_rq(i);
@@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
			load = target_load(i, load_idx);
		} else {
			load = source_load(i, load_idx);

			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;

			if (nr_running > max_nr_running)
				max_nr_running = nr_running;
			if (min_nr_running > nr_running)
				min_nr_running = nr_running;
			update_sg_imb_stats(&sgi, load, nr_running);
		}

		sgs->group_load += load;
@@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
	sgs->group_power = group->sgp->power;
	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;

	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of a task.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 *      might not be a suitable number - should we keep a
	 *      normalized nr_running number somewhere that negates
	 *      the hierarchy?
	 */
	if (sgs->sum_nr_running)
		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task &&
	    (max_nr_running - min_nr_running) > 1)
		sgs->group_imb = 1;
	sgs->group_imb = sg_imbalanced(sgs, &sgi);

	sgs->group_capacity =
		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
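For orientation, the two derived statistics computed above can be worked through with concrete numbers; the sketch below assumes SCHED_POWER_SCALE is 1024 and an illustrative 4-cpu group at full power carrying a combined weighted load of 3072 (values chosen for the example, not taken from the patch).

/*
 * Worked example of avg_load and group_capacity under the stated assumptions.
 */
#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL			/* assumed value */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long group_load  = 3072;		/* assumed sum of cpu loads */
	unsigned long group_power = 4 * SCHED_POWER_SCALE; /* 4 full-power cpus */

	/* avg_load: group load normalised by the group's compute power. */
	unsigned long avg_load = (group_load * SCHED_POWER_SCALE) / group_power;

	/* group_capacity: how many task-sized "slots" the group offers. */
	unsigned long capacity = DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE);

	printf("avg_load = %lu, group_capacity = %lu\n", avg_load, capacity);
	/* prints: avg_load = 768, group_capacity = 4 */
	return 0;
}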
@@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
	busiest = &sds->busiest_stat;

	if (busiest->group_imb) {
		/*
		 * In the group_imb case we cannot rely on group-wide averages
		 * to ensure cpu-load equilibrium, look at wider averages. XXX
		 */
		busiest->load_per_task =
			min(busiest->load_per_task, sds->avg_load);
	}
@@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
	if (!busiest->group_imb) {
		/*
		 * Don't want to pull so many tasks that a group would go idle.
		 * Except of course for the group_imb case, since then we might
		 * have to drop below capacity to reach cpu-load equilibrium.
		 */
		load_above_capacity =
			(busiest->sum_nr_running - busiest->group_capacity);
@@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
	 * we also don't want to reduce the group load below the group capacity
	 * (so that we can implement power-savings policies etc). Thus we look
	 * for the minimum possible imbalance.
	 * Be careful of negative numbers as they'll appear as very large values
	 * with unsigned longs.
	 */
	max_pull = min(busiest->avg_load - sds->avg_load,
		       load_above_capacity);
	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

	/* How much load to actually move to equalise the imbalance */
	env->imbalance = min(
@@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)

	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assumes all things are equal, which typically
	 * work because they assume all things are equal, which typically
	 * isn't true due to cpus_allowed constraints and the like.
	 */
	if (busiest->group_imb)
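Finally, the "Be careful of negative numbers" caution shown in the calculate_imbalance() hunk above describes behaviour of the max_pull computation that can be reproduced standalone: with unsigned longs, busiest->avg_load - sds->avg_load wraps to a huge value if the busiest group's average ever drops below the domain average, so min() then falls back to load_above_capacity. The sketch below uses illustrative values only and is not the kernel code.

/*
 * Sketch of unsigned wrap-around in a max_pull-style computation.
 */
#include <stdio.h>

int main(void)
{
	unsigned long busiest_avg_load    = 512;   /* assumed: below the domain average */
	unsigned long domain_avg_load     = 768;
	unsigned long load_above_capacity = 256;

	unsigned long diff = busiest_avg_load - domain_avg_load;  /* wraps to a huge value */
	unsigned long max_pull = diff < load_above_capacity ? diff : load_above_capacity;

	printf("diff = %lu, max_pull = %lu\n", diff, max_pull);
	/* diff is near ULONG_MAX; max_pull ends up as load_above_capacity (256) */
	return 0;
}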