
Commit ec4e0e2f authored by Ken Chen, committed by Ingo Molnar

sched: fix inconsistency when redistributing per-cpu tg->cfs_rq shares



Impact: make load-balancing more consistent

In the update_shares() path leading to tg_shares_up(), the calculation of
per-cpu cfs_rq shares is rather erratic even under a moderate task wake-up
rate.  The problem is that the per-cpu tg->cfs_rq load weight used for the
sd_rq_weight aggregation and the one used for the actual redistribution of
cfs_rq->shares are collected at different times.  Under moderate system
load, we've seen quite a bit of variation in cfs_rq->shares, which in turn
wildly affects the sched_entity load weights.
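
To make the inconsistency concrete (illustrative numbers only, not from a
trace): say tg->shares = 1024 and, at the time the sum is taken, two CPUs
each carry a cfs_rq load weight of 1024, so sd_rq_weight = 2048.  If the
task on CPU1 migrates to CPU0 before the redistribution step, the old code
re-reads the weights: CPU0 gets 1024 * 2048 / (2048 + 1) ~= 1023, while the
now-idle CPU1 is boosted to NICE_0_LOAD and gets 1024 * 1024 / (2048 + 1)
~= 511.  The group's per-cpu sched_entities then carry about 1534 units of
weight for a group that is supposed to contribute 1024 in total; with a
cached snapshot both CPUs would simply get 512 each.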

This patch caches the per-cpu load weight while doing the sum calculation,
and then passes it down to update_group_shares_cpu() for redistributing the
per-cpu cfs_rq shares.  This keeps the total cfs_rq shares consistent
across all CPUs.  It also simplifies the rounding and the zero-load-weight
check.
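
The idea can be sketched as a small userspace model (simplified and only
illustrative: the *_model names, NR_CPUS and the fixed NICE_0_LOAD value
are assumptions here, and locking, MIN_SHARES/MAX_SHARES clamping and the
sched_domain walk are left out):

/* Simplified model of the cached-snapshot scheme; not kernel code. */
#include <stdio.h>

#define NR_CPUS		4
#define NICE_0_LOAD	1024UL

struct cfs_rq_model {
	unsigned long load_weight;	/* live weight, changes with wakeups */
	unsigned long rq_weight;	/* snapshot cached by the sum phase */
	unsigned long shares;		/* this cpu's part of tg->shares */
};

struct task_group_model {
	unsigned long shares;		/* total shares of the group */
	struct cfs_rq_model cfs_rq[NR_CPUS];
};

/*
 * Redistribution phase: use the *cached* weight so that the numerator and
 * the denominator of  shares = sd_shares * rq_weight / \Sum rq_weight
 * come from the same snapshot.
 */
static void update_group_shares_cpu_model(struct task_group_model *tg, int cpu,
					  unsigned long sd_shares,
					  unsigned long sd_rq_weight)
{
	unsigned long rq_weight = tg->cfs_rq[cpu].rq_weight;

	tg->cfs_rq[cpu].shares = sd_shares * rq_weight / sd_rq_weight;
}

/*
 * Sum phase: walk the CPUs once, cache each per-cpu weight and accumulate
 * the total, then redistribute against exactly those cached values.
 */
static void tg_shares_up_model(struct task_group_model *tg)
{
	unsigned long rq_weight = 0;
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		unsigned long weight = tg->cfs_rq[i].load_weight;

		/* idle cpu: pretend one task of average load is queued */
		if (!weight)
			weight = NICE_0_LOAD;

		tg->cfs_rq[i].rq_weight = weight;
		rq_weight += weight;
	}

	for (i = 0; i < NR_CPUS; i++)
		update_group_shares_cpu_model(tg, i, tg->shares, rq_weight);
}

int main(void)
{
	struct task_group_model tg = {
		.shares = 2048,
		.cfs_rq = { { .load_weight = 3072 }, { .load_weight = 1024 },
			    { .load_weight = 0 }, { .load_weight = 0 } },
	};
	int i;

	tg_shares_up_model(&tg);
	for (i = 0; i < NR_CPUS; i++)
		printf("cpu%d: shares=%lu\n", i, tg.cfs_rq[i].shares);
	return 0;
}

Because every CPU is divided against the same cached sum, the per-cpu
shares always add up to tg->shares (modulo integer rounding), no matter how
the live load.weight values move around between the two phases.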

Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 3ac3ba0b
kernel/sched.c: +15 −26
@@ -1453,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
 	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
 	/*
 	 *           \Sum shares * rq_weight
@@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
 	for_each_cpu_mask(i, sd->span)
 		update_group_shares_cpu(tg, i, shares, rq_weight);