sched: Rewrite tg_shares_up) (2069dd75) · Commits · e / devices / android_kernel_teracube_2e

include/linux/sched.h

+0 −2

Original line number	Diff line number	Diff line
		@@ -1885,8 +1885,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
		extern unsigned int sysctl_sched_latency;
		extern unsigned int sysctl_sched_min_granularity;
		extern unsigned int sysctl_sched_wakeup_granularity;
		extern unsigned int sysctl_sched_shares_ratelimit;
		extern unsigned int sysctl_sched_shares_thresh;
		extern unsigned int sysctl_sched_child_runs_first;

		enum sched_tunable_scaling {

kernel/sched.c

+44 −129

Original line number	Diff line number	Diff line
		@@ -253,6 +253,8 @@ struct task_group {
		/* runqueue "owned" by this group on each cpu */
		struct cfs_rq **cfs_rq;
		unsigned long shares;

		atomic_t load_weight;
		#endif

		#ifdef CONFIG_RT_GROUP_SCHED
		@@ -359,15 +361,11 @@ struct cfs_rq {
		*/
		unsigned long h_load;

		/*
		* this cpu's part of tg->shares
		*/
		unsigned long shares;
		u64 load_avg;
		u64 load_period;
		u64 load_stamp;

		/*
		* load.weight at the time we set shares
		*/
		unsigned long rq_weight;
		unsigned long load_contribution;
		#endif
		#endif
		};
		@@ -806,20 +804,6 @@ late_initcall(sched_init_debug);
		*/
		const_debug unsigned int sysctl_sched_nr_migrate = 32;

		/*
		* ratelimit for updating the group shares.
		* default: 0.25ms
		*/
		unsigned int sysctl_sched_shares_ratelimit = 250000;
		unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

		/*
		* Inject some fuzzyness into changing the per-cpu group shares
		* this avoids remote rq-locks at the expense of fairness.
		* default: 4
		*/
		unsigned int sysctl_sched_shares_thresh = 4;

		/*
		* period over which we average the RT time consumption, measured
		* in ms.
		@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
		lw->inv_weight = 0;
		}

		static inline void update_load_set(struct load_weight *lw, unsigned long w)
		{
		lw->weight = w;
		lw->inv_weight = 0;
		}

		/*
		* To aid in avoiding the subversion of "niceness" due to uneven distribution
		* of tasks with abnormal "nice" values across CPUs the contribution that
		@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)

		#ifdef CONFIG_FAIR_GROUP_SCHED

		static __read_mostly unsigned long __percpu *update_shares_data;

		static void __set_se_shares(struct sched_entity *se, unsigned long shares);

		/*
		* Calculate and set the cpu's group shares.
		*/
		static void update_group_shares_cpu(struct task_group *tg, int cpu,
		unsigned long sd_shares,
		unsigned long sd_rq_weight,
		unsigned long *usd_rq_weight)
		{
		unsigned long shares, rq_weight;
		int boost = 0;

		rq_weight = usd_rq_weight[cpu];
		if (!rq_weight) {
		boost = 1;
		rq_weight = NICE_0_LOAD;
		}

		/*
		* \Sum_j shares_j * rq_weight_i
		* shares_i = -----------------------------
		* \Sum_j rq_weight_j
		*/
		shares = (sd_shares * rq_weight) / sd_rq_weight;
		shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);

		if (abs(shares - tg->se[cpu]->load.weight) >
		sysctl_sched_shares_thresh) {
		struct rq *rq = cpu_rq(cpu);
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
		__set_se_shares(tg->se[cpu], shares);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		}
		}
		static void update_cfs_load(struct cfs_rq *cfs_rq);
		static void update_cfs_shares(struct cfs_rq *cfs_rq);

		/*
		* Re-compute the task group their per cpu shares over the given domain.
		* This needs to be done in a bottom-up fashion because the rq weight of a
		* parent group depends on the shares of its child groups.
		* update tg->load_weight by folding this cpu's load_avg
		*/
		static int tg_shares_up(struct task_group tg, void data)
		{
		unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
		unsigned long *usd_rq_weight;
		struct sched_domain *sd = data;
		long load_avg;
		struct cfs_rq *cfs_rq;
		unsigned long flags;
		int i;
		int cpu = (long)data;
		struct rq *rq;

		if (!tg->se[0])
		if (!tg->se[cpu])
		return 0;

		local_irq_save(flags);
		usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());

		for_each_cpu(i, sched_domain_span(sd)) {
		weight = tg->cfs_rq[i]->load.weight;
		usd_rq_weight[i] = weight;

		rq_weight += weight;
		/*
		* If there are currently no tasks on the cpu pretend there
		* is one of average load so that when a new task gets to
		* run here it will not get delayed by group starvation.
		*/
		if (!weight)
		weight = NICE_0_LOAD;
		rq = cpu_rq(cpu);
		cfs_rq = tg->cfs_rq[cpu];

		sum_weight += weight;
		shares += tg->cfs_rq[i]->shares;
		}
		raw_spin_lock_irqsave(&rq->lock, flags);

		if (!rq_weight)
		rq_weight = sum_weight;
		update_rq_clock(rq);
		update_cfs_load(cfs_rq);

		if ((!shares && rq_weight) \|\| shares > tg->shares)
		shares = tg->shares;
		load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
		load_avg -= cfs_rq->load_contribution;

		if (!sd->parent \|\| !(sd->parent->flags & SD_LOAD_BALANCE))
		shares = tg->shares;
		atomic_add(load_avg, &tg->load_weight);
		cfs_rq->load_contribution += load_avg;

		for_each_cpu(i, sched_domain_span(sd))
		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
		/*
		* We need to update shares after updating tg->load_weight in
		* order to adjust the weight of groups with long running tasks.
		*/
		update_cfs_shares(cfs_rq);

		local_irq_restore(flags);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		return 0;
		}
		@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group tg, void data)
		load = cpu_rq(cpu)->load.weight;
		} else {
		load = tg->parent->cfs_rq[cpu]->h_load;
		load *= tg->cfs_rq[cpu]->shares;
		load *= tg->se[cpu]->load.weight;
		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
		}

		@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group tg, void data)
		return 0;
		}

		static void update_shares(struct sched_domain *sd)
		static void update_shares(long cpu)
		{
		s64 elapsed;
		u64 now;

		if (root_task_group_empty())
		return;

		now = local_clock();
		elapsed = now - sd->last_update;
		/*
		* XXX: replace with an on-demand list
		*/

		if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
		sd->last_update = now;
		walk_tg_tree(tg_nop, tg_shares_up, sd);
		}
		walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
		}

		static void update_h_load(long cpu)
		@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)

		#else

		static inline void update_shares(struct sched_domain *sd)
		static inline void update_shares(int cpu)
		{
		}

		@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq rq1, struct rq rq2)

		#endif

		#ifdef CONFIG_FAIR_GROUP_SCHED
		static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
		{
		#ifdef CONFIG_SMP
		cfs_rq->shares = shares;
		#endif
		}
		#endif

		static void calc_load_account_idle(struct rq *this_rq);
		static void update_sysctl(void);
		static int get_update_sysctl_factor(void);
		@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
		SET_SYSCTL(sched_min_granularity);
		SET_SYSCTL(sched_latency);
		SET_SYSCTL(sched_wakeup_granularity);
		SET_SYSCTL(sched_shares_ratelimit);
		#undef SET_SYSCTL
		}

		@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
		se->cfs_rq = parent->my_q;

		se->my_q = cfs_rq;
		se->load.weight = tg->shares;
		se->load.inv_weight = 0;
		update_load_set(&se->load, tg->shares);
		se->parent = parent;
		}
		#endif
		@@ -7881,10 +7802,6 @@ void __init sched_init(void)

		#endif /* CONFIG_CGROUP_SCHED */

		#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
		update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
		__alignof__(unsigned long));
		#endif
		for_each_possible_cpu(i) {
		struct rq *rq;

		@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
		if (on_rq)
		dequeue_entity(cfs_rq, se, 0);

		se->load.weight = shares;
		se->load.inv_weight = 0;
		update_load_set(&se->load, shares);

		if (on_rq)
		enqueue_entity(cfs_rq, se, 0);
		@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
		/*
		* force a rebalance
		*/
		cfs_rq_set_shares(tg->cfs_rq[i], 0);
		set_se_shares(tg->se[i], shares);
		}

kernel/sched_debug.c

+11 −4

Original line number	Diff line number	Diff line
		@@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file m, int cpu, struct cfs_rq cfs_rq)
		spread0 = min_vruntime - rq0_min_vruntime;
		SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
		SPLIT_NS(spread0));
		SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
		SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);

		SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
		cfs_rq->nr_spread_over);
		SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
		SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
		#ifdef CONFIG_FAIR_GROUP_SCHED
		#ifdef CONFIG_SMP
		SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
		SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
		SPLIT_NS(cfs_rq->load_avg));
		SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
		SPLIT_NS(cfs_rq->load_period));
		SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
		cfs_rq->load_contribution);
		SEQ_printf(m, " .%-30s: %d\n", "load_tg",
		atomic_read(&tg->load_weight));
		#endif

		print_cfs_group_stats(m, cpu, cfs_rq->tg);
		#endif
		}

kernel/sched_fair.c

+107 −57

Original line number	Diff line number	Diff line
		@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
		WRT_SYSCTL(sched_min_granularity);
		WRT_SYSCTL(sched_latency);
		WRT_SYSCTL(sched_wakeup_granularity);
		WRT_SYSCTL(sched_shares_ratelimit);
		#undef WRT_SYSCTL

		return 0;
		@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
		list_add(&se->group_node, &cfs_rq->tasks);
		}
		cfs_rq->nr_running++;
		se->on_rq = 1;
		}

		static void
		@@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
		list_del_init(&se->group_node);
		}
		cfs_rq->nr_running--;
		se->on_rq = 0;
		}

		#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
		static void update_cfs_load(struct cfs_rq *cfs_rq)
		{
		u64 period = sched_avg_period();
		u64 now, delta;

		if (!cfs_rq)
		return;

		now = rq_of(cfs_rq)->clock;
		delta = now - cfs_rq->load_stamp;

		cfs_rq->load_stamp = now;
		cfs_rq->load_period += delta;
		cfs_rq->load_avg += delta * cfs_rq->load.weight;

		while (cfs_rq->load_period > period) {
		/*
		* Inline assembly required to prevent the compiler
		* optimising this loop into a divmod call.
		* See __iter_div_u64_rem() for another example of this.
		*/
		asm("" : "+rm" (cfs_rq->load_period));
		cfs_rq->load_period /= 2;
		cfs_rq->load_avg /= 2;
		}
		}

		static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
		unsigned long weight)
		{
		if (se->on_rq)
		account_entity_dequeue(cfs_rq, se);

		update_load_set(&se->load, weight);

		if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
		}

		static void update_cfs_shares(struct cfs_rq *cfs_rq)
		{
		struct task_group *tg;
		struct sched_entity *se;
		long load_weight, load, shares;

		if (!cfs_rq)
		return;

		tg = cfs_rq->tg;
		se = tg->se[cpu_of(rq_of(cfs_rq))];
		if (!se)
		return;

		load = cfs_rq->load.weight;

		load_weight = atomic_read(&tg->load_weight);
		load_weight -= cfs_rq->load_contribution;
		load_weight += load;

		shares = (tg->shares * load);
		if (load_weight)
		shares /= load_weight;

		if (shares < MIN_SHARES)
		shares = MIN_SHARES;
		if (shares > tg->shares)
		shares = tg->shares;

		reweight_entity(cfs_rq_of(se), se, shares);
		}
		#else /* CONFIG_FAIR_GROUP_SCHED */
		static inline void update_cfs_load(struct cfs_rq *cfs_rq)
		{
		}

		static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
		{
		}
		#endif /* CONFIG_FAIR_GROUP_SCHED */

		static void enqueue_sleeper(struct cfs_rq cfs_rq, struct sched_entity se)
		{
		#ifdef CONFIG_SCHEDSTATS
		@@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
		* Update run-time statistics of the 'current'.
		*/
		update_curr(cfs_rq);
		update_cfs_load(cfs_rq);
		account_entity_enqueue(cfs_rq, se);
		update_cfs_shares(cfs_rq);

		if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);
		@@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
		check_spread(cfs_rq, se);
		if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
		se->on_rq = 1;
		}

		static void __clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
		@@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)

		if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
		se->on_rq = 0;
		update_cfs_load(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
		update_min_vruntime(cfs_rq);
		update_cfs_shares(cfs_rq);

		/*
		* Normalize the entity after updating the min_vruntime because the
		@@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
		flags = ENQUEUE_WAKEUP;
		}

		for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		update_cfs_load(cfs_rq);
		update_cfs_shares(cfs_rq);
		}

		hrtick_update(rq);
		}

		@@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
		for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
		break;
		flags \|= DEQUEUE_SLEEP;
		}

		for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		update_cfs_load(cfs_rq);
		update_cfs_shares(cfs_rq);
		}

		hrtick_update(rq);
		}

		@@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq rq, struct task_struct p)
		* Adding load to a group doesn't make a group heavier, but can cause movement
		* of group shares between cpus. Assuming the shares were perfectly aligned one
		* can calculate the shift in shares.
		*
		* The problem is that perfectly aligning the shares is rather expensive, hence
		* we try to avoid doing that too often - see update_shares(), which ratelimits
		* this change.
		*
		* We compensate this by not only taking the current delta into account, but
		* also considering the delta between when the shares were last adjusted and
		* now.
		*
		* We still saw a performance dip, some tracing learned us that between
		* cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
		* significantly. Therefore try to bias the error in direction of failing
		* the affine wakeup.
		*
		*/
		static long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
		static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
		{
		struct sched_entity *se = tg->se[cpu];

		if (!tg->parent)
		return wl;

		/*
		* By not taking the decrease of shares on the other cpu into
		* account our error leans towards reducing the affine wakeups.
		*/
		if (!wl && sched_feat(ASYM_EFF_LOAD))
		return wl;

		for_each_sched_entity(se) {
		long S, rw, s, a, b;
		long more_w;

		/*
		* Instead of using this increment, also add the difference
		* between when the shares were last updated and now.
		*/
		more_w = se->my_q->load.weight - se->my_q->rq_weight;
		wl += more_w;
		wg += more_w;

		S = se->my_q->tg->shares;
		s = se->my_q->shares;
		rw = se->my_q->rq_weight;
		s = se->load.weight;
		rw = se->my_q->load.weight;

		a = S*(rw + wl);
		b = Srw + swg;
		@@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq rq, struct task_struct p, int sd_flag, int wake_
		sd = tmp;
		}

		#ifdef CONFIG_FAIR_GROUP_SCHED
		if (sched_feat(LB_SHARES_UPDATE)) {
		/*
		* Pick the largest domain to update shares over
		*/
		tmp = sd;
		if (affine_sd && (!tmp \|\| affine_sd->span_weight > sd->span_weight))
		tmp = affine_sd;

		if (tmp) {
		raw_spin_unlock(&rq->lock);
		update_shares(tmp);
		raw_spin_lock(&rq->lock);
		}
		}
		#endif

		if (affine_sd) {
		if (cpu == prev_cpu \|\| wake_affine(affine_sd, p, sync))
		return select_idle_sibling(p, cpu);
		@@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		schedstat_inc(sd, lb_count[idle]);

		redo:
		update_shares(sd);
		group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
		cpus, balance);

		@@ -3156,8 +3206,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
		else
		ld_moved = 0;
		out:
		if (ld_moved)
		update_shares(sd);
		return ld_moved;
		}

		@@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
		int update_next_balance = 0;
		int need_serialize;

		update_shares(cpu);

		for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
		continue;

kernel/sched_features.h

+0 −2

Original line number	Diff line number	Diff line
		@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
		SCHED_FEAT(HRTICK, 0)
		SCHED_FEAT(DOUBLE_TICK, 0)
		SCHED_FEAT(LB_BIAS, 1)
		SCHED_FEAT(LB_SHARES_UPDATE, 1)
		SCHED_FEAT(ASYM_EFF_LOAD, 1)

		/*
		* Spin-wait on mutex acquisition when the mutex owner is running on