Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6e35cb2f authored by Pavankumar Kondeti's avatar Pavankumar Kondeti Committed by Junjie Wu
Browse files

sched: Add separate load tracking histogram to predict loads



Current window based load tracking only saves history for five
windows. A historically heavy task's heavy load will be completely
forgotten after five windows of light load. Even before the five
windows expire, a heavy task that wakes up on the same CPU it used to
run on won't trigger any frequency change until the end of the window,
so it would starve for the entire window. It also adds one "small"
load window to history because it is accumulating load at a low
frequency, further reducing the tracked load for this heavy task.

Ideally, scheduler should be able to identify such tasks and notify
governor to increase frequency immediately after it wakes up.

Add a histogram for each task to track a much longer load history. A
prediction will be made based on the runtime of the previous or current
window, histogram data and load tracked in recent windows. Predictions
of all tasks that are currently running or runnable on a CPU are
aggregated and reported to the CPUFreq governor in sched_get_cpus_busy().

sched_get_cpus_busy() now returns predicted busy time in addition
to previous window busy time and new task busy time, scaled to
the CPU maximum possible frequency.

Tunables:

- /proc/sys/kernel/sched_pred_alert_freq (KHz)

This tunable can be used to further filter the notifications.
A frequency alert notification is sent only when the predicted
load exceeds the previous window load by sched_pred_alert_freq
converted to load.

Change-Id: If29098cd2c5499163ceaff18668639db76ee8504
Suggested-by: default avatarSaravana Kannan <skannan@codeaurora.org>
Signed-off-by: default avatarPavankumar Kondeti <pkondeti@codeaurora.org>
Signed-off-by: default avatarJoonwoo Park <joonwoop@codeaurora.org>
Signed-off-by: default avatarJunjie Wu <junjiew@codeaurora.org>
parent ee098af9
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -1137,6 +1137,7 @@ struct sched_statistics {
#endif

#define RAVG_HIST_SIZE_MAX  5
#define NUM_BUSY_BUCKETS 10

/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
@@ -1161,6 +1162,11 @@ struct ravg {
	 *
	 * 'prev_window' represents task's contribution to cpu busy time
	 * statistics (rq->prev_runnable_sum) in previous window
	 *
	 * 'pred_demand' represents task's current predicted cpu busy time
	 *
	 * 'busy_buckets' groups historical busy time into different buckets
	 * used for prediction
	 */
	u64 mark_start;
	u32 sum, demand;
@@ -1168,6 +1174,8 @@ struct ravg {
#ifdef CONFIG_SCHED_FREQ_INPUT
	u32 curr_window, prev_window;
	u16 active_windows;
	u32 pred_demand;
	u8 busy_buckets[NUM_BUSY_BUCKETS];
#endif
};

@@ -1972,6 +1980,7 @@ extern int task_free_unregister(struct notifier_block *n);
struct sched_load {
	unsigned long prev_load;
	unsigned long new_task_load;
	unsigned long predicted_load;
};

#if defined(CONFIG_SCHED_FREQ_INPUT)
+1 −0
Original line number Diff line number Diff line
@@ -77,6 +77,7 @@ extern unsigned int sysctl_sched_enable_colocation;
extern unsigned int sysctl_sched_restrict_cluster_spill;
#if defined(CONFIG_SCHED_FREQ_INPUT)
extern unsigned int sysctl_sched_new_task_windows;
extern unsigned int sysctl_sched_pred_alert_freq;
#endif
#endif

+92 −14
Original line number Diff line number Diff line
@@ -77,6 +77,9 @@ TRACE_EVENT(sched_enq_deq_task,
		__field(unsigned int,	cpus_allowed		)
#ifdef CONFIG_SCHED_HMP
		__field(unsigned int,	demand			)
#ifdef CONFIG_SCHED_FREQ_INPUT
		__field(unsigned int,	pred_demand		)
#endif
#endif
	),

@@ -92,12 +95,18 @@ TRACE_EVENT(sched_enq_deq_task,
		__entry->cpus_allowed	= cpus_allowed;
#ifdef CONFIG_SCHED_HMP
		__entry->demand		= p->ravg.demand;
#ifdef CONFIG_SCHED_FREQ_INPUT
		__entry->pred_demand	= p->ravg.pred_demand;
#endif
#endif
	),

	TP_printk("cpu=%d %s comm=%s pid=%d prio=%d nr_running=%u cpu_load=%lu rt_nr_running=%u affine=%x"
#ifdef CONFIG_SCHED_HMP
		 " demand=%u"
#ifdef CONFIG_SCHED_FREQ_INPUT
		 " pred_demand=%u"
#endif
#endif
			, __entry->cpu,
			__entry->enqueue ? "enqueue" : "dequeue",
@@ -106,6 +115,9 @@ TRACE_EVENT(sched_enq_deq_task,
			__entry->cpu_load, __entry->rt_nr_running, __entry->cpus_allowed
#ifdef CONFIG_SCHED_HMP
			, __entry->demand
#ifdef CONFIG_SCHED_FREQ_INPUT
			, __entry->pred_demand
#endif
#endif
			)
);
@@ -282,6 +294,7 @@ TRACE_EVENT(sched_update_task_ravg,
		__field(unsigned int,	sum			)
		__field(	 int,	cpu			)
#ifdef CONFIG_SCHED_FREQ_INPUT
		__field(unsigned int,	pred_demand		)
		__field(	u64,	cs			)
		__field(	u64,	ps			)
		__field(	u32,	curr_window		)
@@ -308,6 +321,7 @@ TRACE_EVENT(sched_update_task_ravg,
		__entry->sum            = p->ravg.sum;
		__entry->irqtime        = irqtime;
#ifdef CONFIG_SCHED_FREQ_INPUT
		__entry->pred_demand     = p->ravg.pred_demand;
		__entry->cs             = rq->curr_runnable_sum;
		__entry->ps             = rq->prev_runnable_sum;
		__entry->curr_window	= p->ravg.curr_window;
@@ -320,7 +334,7 @@ TRACE_EVENT(sched_update_task_ravg,

	TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
#ifdef CONFIG_SCHED_FREQ_INPUT
		" cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
		" pred_demand %u cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
#endif
		, __entry->wallclock, __entry->win_start, __entry->delta,
		task_event_names[__entry->evt], __entry->cpu,
@@ -329,8 +343,8 @@ TRACE_EVENT(sched_update_task_ravg,
		__entry->delta_m, __entry->demand,
		__entry->sum, __entry->irqtime
#ifdef CONFIG_SCHED_FREQ_INPUT
		, __entry->cs, __entry->ps, __entry->curr_window,
		  __entry->prev_window,
		, __entry->pred_demand, __entry->cs, __entry->ps,
		__entry->curr_window, __entry->prev_window,
		  __entry->nt_cs, __entry->nt_ps,
		  __entry->active_windows
#endif
@@ -351,6 +365,9 @@ TRACE_EVENT(sched_update_history,
		__field(	 int,	samples			)
		__field(enum task_event,	evt		)
		__field(unsigned int,	demand			)
#ifdef CONFIG_SCHED_FREQ_INPUT
		__field(unsigned int,	pred_demand		)
#endif
		__array(	 u32,	hist, RAVG_HIST_SIZE_MAX)
		__field(unsigned int,	nr_big_tasks		)
		__field(	 int,	cpu			)
@@ -363,18 +380,29 @@ TRACE_EVENT(sched_update_history,
		__entry->samples        = samples;
		__entry->evt            = evt;
		__entry->demand         = p->ravg.demand;
#ifdef CONFIG_SCHED_FREQ_INPUT
		__entry->pred_demand     = p->ravg.pred_demand;
#endif
		memcpy(__entry->hist, p->ravg.sum_history,
					RAVG_HIST_SIZE_MAX * sizeof(u32));
		__entry->nr_big_tasks   = rq->hmp_stats.nr_big_tasks;
		__entry->cpu            = rq->cpu;
	),

	TP_printk("%d (%s): runtime %u samples %d event %s demand %u (hist: %u %u %u %u %u) cpu %d nr_big %u",
	TP_printk("%d (%s): runtime %u samples %d event %s demand %u"
#ifdef CONFIG_SCHED_FREQ_INPUT
		" pred_demand %u"
#endif
		" (hist: %u %u %u %u %u) cpu %d nr_big %u",
		__entry->pid, __entry->comm,
		__entry->runtime, __entry->samples,
		task_event_names[__entry->evt],
		__entry->demand, __entry->hist[0],
		__entry->hist[1], __entry->hist[2], __entry->hist[3],
		__entry->demand,
#ifdef CONFIG_SCHED_FREQ_INPUT
		__entry->pred_demand,
#endif
		__entry->hist[0], __entry->hist[1],
		__entry->hist[2], __entry->hist[3],
		__entry->hist[4], __entry->cpu, __entry->nr_big_tasks)
);

@@ -413,6 +441,43 @@ TRACE_EVENT(sched_reset_all_window_stats,

#ifdef CONFIG_SCHED_FREQ_INPUT

/*
 * Tracepoint for predicted-demand updates: records the sampled runtime,
 * the percentage of the window it represents, the resulting prediction
 * and a snapshot of the task's busy-time histogram buckets.
 */
TRACE_EVENT(sched_update_pred_demand,

	TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int pct,
		 unsigned int pred_demand),

	TP_ARGS(rq, p, runtime, pct, pred_demand),

	TP_STRUCT__entry(
		__array(	char,	comm,   TASK_COMM_LEN	)
		__field(       pid_t,	pid			)
		__field(unsigned int,	runtime			)
		__field(	 int,	pct			)
		__field(unsigned int,	pred_demand		)
		__array(	  u8,	bucket, NUM_BUSY_BUCKETS)
		__field(	 int,	cpu			)
	),

	TP_fast_assign(
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid            = p->pid;
		__entry->runtime        = runtime;
		__entry->pct            = pct;
		__entry->pred_demand    = pred_demand;
		memcpy(__entry->bucket, p->ravg.busy_buckets,
					NUM_BUSY_BUCKETS * sizeof(u8));
		__entry->cpu            = rq->cpu;
	),

	TP_printk("%d (%s): runtime %u pct %d cpu %d pred_demand %u (buckets: %u %u %u %u %u %u %u %u %u %u)",
		__entry->pid, __entry->comm,
		__entry->runtime, __entry->pct, __entry->cpu,
		__entry->pred_demand, __entry->bucket[0], __entry->bucket[1],
		__entry->bucket[2], __entry->bucket[3], __entry->bucket[4],
		__entry->bucket[5], __entry->bucket[6], __entry->bucket[7],
		__entry->bucket[8], __entry->bucket[9])
);

TRACE_EVENT(sched_migration_update_sum,

	TP_PROTO(struct rq *rq, struct task_struct *p),
@@ -444,14 +509,15 @@ TRACE_EVENT(sched_migration_update_sum,

TRACE_EVENT(sched_get_busy,

	TP_PROTO(int cpu, u64 load, u64 nload, int early),
	TP_PROTO(int cpu, u64 load, u64 nload, u64 pload, int early),

	TP_ARGS(cpu, load, nload, early),
	TP_ARGS(cpu, load, nload, pload, early),

	TP_STRUCT__entry(
		__field(	int,	cpu			)
		__field(	u64,	load			)
		__field(	u64,	nload			)
		__field(	u64,	pload			)
		__field(	int,	early			)
	),

@@ -459,33 +525,45 @@ TRACE_EVENT(sched_get_busy,
		__entry->cpu		= cpu;
		__entry->load		= load;
		__entry->nload		= nload;
		__entry->pload		= pload;
		__entry->early		= early;
	),

	TP_printk("cpu %d load %lld new_task_load %lld early %d",
		__entry->cpu, __entry->load, __entry->nload, __entry->early)
	TP_printk("cpu %d load %lld new_task_load %lld predicted_load %lld early %d",
		__entry->cpu, __entry->load, __entry->nload,
		__entry->pload, __entry->early)
);

TRACE_EVENT(sched_freq_alert,

	TP_PROTO(int cpu, u64 old_load, u64 new_load),
	TP_PROTO(int cpu, int pd_notif, u64 old_load, u64 new_load,
		u64 old_pred, u64 new_pred),

	TP_ARGS(cpu, old_load, new_load),
	TP_ARGS(cpu, pd_notif, old_load, new_load, old_pred, new_pred),

	TP_STRUCT__entry(
		__field(	int,	cpu			)
		__field(	int,	pd_notif		)
		__field(	u64,	old_load		)
		__field(	u64,	new_load		)
		__field(	u64,	old_pred		)
		__field(	u64,	new_pred		)
	),

	TP_fast_assign(
		__entry->cpu		= cpu;
		__entry->pd_notif	= pd_notif;
		__entry->old_load	= old_load;
		__entry->new_load	= new_load;
		__entry->old_pred	= old_pred;
		__entry->new_pred	= new_pred;
	),

	TP_printk("cpu %d old_load=%llu new_load=%llu",
		__entry->cpu, __entry->old_load, __entry->new_load)
	TP_printk("cpu %d pd_notif=%d old_load=%llu new_load=%llu "
		"old_pred=%llu new_pred=%llu",
		__entry->cpu, __entry->pd_notif, __entry->old_load,
		__entry->new_load, __entry->old_pred,
		 __entry->new_pred)
);

#endif	/* CONFIG_SCHED_FREQ_INPUT */
+313 −25
Original line number Diff line number Diff line
@@ -1656,8 +1656,6 @@ static __read_mostly unsigned int sched_window_stats_policy =
__read_mostly unsigned int sysctl_sched_window_stats_policy =
	WINDOW_STATS_MAX_RECENT_AVG;

__read_mostly unsigned int sysctl_sched_new_task_windows = 5;

static __read_mostly unsigned int sched_account_wait_time = 1;
__read_mostly unsigned int sysctl_sched_account_wait_time = 1;

@@ -1667,6 +1665,8 @@ unsigned int __read_mostly sysctl_sched_enable_colocation = 1;

#ifdef CONFIG_SCHED_FREQ_INPUT

__read_mostly unsigned int sysctl_sched_new_task_windows = 5;

static __read_mostly unsigned int sched_migration_fixup = 1;
__read_mostly unsigned int sysctl_sched_migration_fixup = 1;

@@ -1686,6 +1686,9 @@ __read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */

static __read_mostly unsigned int sched_io_is_busy;

__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;

#endif	/* CONFIG_SCHED_FREQ_INPUT */

/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
@@ -1723,6 +1726,15 @@ __read_mostly unsigned int sched_ravg_window = 10000000;
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;

/*
 * Major task runtime. If a task runs for more than sched_major_task_runtime
 * in a window, it's considered to be generating majority of workload
 * for this window. Prediction could be adjusted for such tasks.
 */
#ifdef CONFIG_SCHED_FREQ_INPUT
__read_mostly unsigned int sched_major_task_runtime = 10000000;
#endif

static unsigned int sync_cpu;

#define EXITING_TASK_MARKER	0xdeaddead
@@ -1820,7 +1832,7 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load)
}

/* Should scheduler alert governor for changing frequency? */
static int send_notification(struct rq *rq)
static int send_notification(struct rq *rq, int check_pred)
{
	unsigned int cur_freq, freq_required;
	unsigned long flags;
@@ -1829,11 +1841,29 @@ static int send_notification(struct rq *rq)
	if (!sched_enable_hmp)
		return 0;

	if (check_pred) {
		u64 prev = rq->old_busy_time;
		u64 predicted = rq->hmp_stats.pred_demands_sum;

		if (rq->cluster->cur_freq == rq->cluster->max_freq)
			return 0;

		prev = max(prev, rq->old_estimated_time);
		if (prev > predicted)
			return 0;

		cur_freq = load_to_freq(rq, prev);
		freq_required = load_to_freq(rq, predicted);

		if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
			return 0;
	} else {
		cur_freq = load_to_freq(rq, rq->old_busy_time);
		freq_required = load_to_freq(rq, rq->prev_runnable_sum);

		if (nearly_same_freq(cur_freq, freq_required))
			return 0;
	}

	raw_spin_lock_irqsave(&rq->lock, flags);
	if (!rq->notifier_sent) {
@@ -1846,14 +1876,16 @@ static int send_notification(struct rq *rq)
}

/* Alert governor if there is a need to change frequency */
void check_for_freq_change(struct rq *rq)
void check_for_freq_change(struct rq *rq, bool check_pred)
{
	int cpu = cpu_of(rq);

	if (!send_notification(rq))
	if (!send_notification(rq, check_pred))
		return;

	trace_sched_freq_alert(cpu, rq->old_busy_time, rq->prev_runnable_sum);
	trace_sched_freq_alert(cpu, check_pred, rq->old_busy_time,
			rq->prev_runnable_sum, rq->old_estimated_time,
			rq->hmp_stats.pred_demands_sum);

	atomic_notifier_call_chain(
		&load_alert_notifier_head, 0,
@@ -1904,6 +1936,223 @@ static inline bool is_new_task(struct task_struct *p)
	return p->ravg.active_windows < sysctl_sched_new_task_windows;
}

#define INC_STEP 8		/* default increment for the hit bucket */
#define DEC_STEP 2		/* decay applied to every other bucket */
#define CONSISTENT_THRES 16	/* count above which a bucket is "consistent" */
#define INC_STEP_BIG 16		/* faster increment for consistent buckets */
/*
 * bucket_increase - update the count of all buckets
 *
 * @buckets: array of buckets tracking busy time of a task
 * @idx: the index of bucket to be incremented
 *
 * Each time a complete window finishes, count of bucket that runtime
 * falls in (@idx) is incremented. Counts of all other buckets are
 * decayed. The rate of increase and decay could be different based
 * on current count in the bucket.
 */
static inline void bucket_increase(u8 *buckets, int idx)
{
	int i, step;

	for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
		if (idx != i) {
			/* decay buckets the runtime did not fall into */
			if (buckets[i] > DEC_STEP)
				buckets[i] -= DEC_STEP;
			else
				buckets[i] = 0;
		} else {
			/*
			 * A consistently-hit bucket grows faster; saturate
			 * at U8_MAX rather than wrapping the u8 counter.
			 */
			step = buckets[i] >= CONSISTENT_THRES ?
						INC_STEP_BIG : INC_STEP;
			if (buckets[i] > U8_MAX - step)
				buckets[i] = U8_MAX;
			else
				buckets[i] += step;
		}
	}
}

/*
 * busy_to_bucket - map a normalized busy time onto a histogram bucket.
 * Buckets partition [0, max_task_load()) into NUM_BUSY_BUCKETS slots;
 * the result is clamped to the valid index range.
 */
static inline int busy_to_bucket(u32 normalized_rt)
{
	int bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());

	if (bidx >= NUM_BUSY_BUCKETS)
		bidx = NUM_BUSY_BUCKETS - 1;

	/*
	 * The lowest two buckets are combined: the minimum CPU frequency
	 * already lands tasks in the 2nd bucket, so predicting the lowest
	 * bucket carries no useful information.
	 */
	if (bidx == 0)
		bidx = 1;

	return bidx;
}

/*
 * scale_load_to_freq - rescale @load measured at @src_freq to the
 * equivalent value at @dst_freq.
 */
static inline u64
scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
{
	u64 scaled = load * (u64)src_freq;

	return div64_u64(scaled, (u64)dst_freq);
}

#define HEAVY_TASK_SKIP 2
#define HEAVY_TASK_SKIP_LIMIT 4
/*
 * get_pred_busy - calculate predicted demand for a task on runqueue
 *
 * @rq: runqueue of task p
 * @p: task whose prediction is being updated
 * @start: starting bucket. returned prediction should not be lower than
 *         this bucket.
 * @runtime: runtime of the task. returned prediction should not be lower
 *           than this runtime.
 * Note: @start can be derived from @runtime. It's passed in only to
 * avoid duplicated calculation in some cases.
 *
 * A new predicted busy time is returned for task @p based on @runtime
 * passed in. The function searches through buckets that represent busy
 * time equal to or bigger than @runtime and attempts to find the bucket
 * to use for prediction. Once found, it searches through historical busy
 * time and returns the latest that falls into the bucket. If no such busy
 * time exists, it returns the midpoint of that bucket.
 */
static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
				int start, u32 runtime)
{
	int i;
	u8 *buckets = p->ravg.busy_buckets;
	u32 *hist = p->ravg.sum_history;
	u32 dmin, dmax;
	u64 cur_freq_runtime = 0;
	int first = NUM_BUSY_BUCKETS, final, skip_to;
	u32 ret = runtime;

	/* skip prediction for new tasks due to lack of history */
	if (unlikely(is_new_task(p)))
		goto out;

	/* find minimal bucket index to pick */
	for (i = start; i < NUM_BUSY_BUCKETS; i++) {
		if (buckets[i]) {
			first = i;
			break;
		}
	}
	/* if no higher buckets are filled, predict runtime */
	if (first >= NUM_BUSY_BUCKETS)
		goto out;

	/* compute the bucket for prediction */
	final = first;
	if (first < HEAVY_TASK_SKIP_LIMIT) {
		/* compute runtime at current CPU frequency */
		cur_freq_runtime = mult_frac(runtime, max_possible_efficiency,
					     rq->cluster->efficiency);
		cur_freq_runtime = scale_load_to_freq(cur_freq_runtime,
				max_possible_freq, rq->cluster->cur_freq);
		/*
		 * if the task runs for majority of the window, try to
		 * pick higher buckets.
		 */
		if (cur_freq_runtime >= sched_major_task_runtime) {
			int next = NUM_BUSY_BUCKETS;
			/*
			 * if there is a higher bucket that's consistently
			 * hit, don't jump beyond that.
			 */
			for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT &&
			     i < NUM_BUSY_BUCKETS; i++) {
				if (buckets[i] > CONSISTENT_THRES) {
					next = i;
					break;
				}
			}
			skip_to = min(next, start + HEAVY_TASK_SKIP);
			/* don't jump beyond HEAVY_TASK_SKIP_LIMIT */
			skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to);
			/* don't go below first non-empty bucket, if any */
			final = max(first, skip_to);
		}
	}

	/* determine demand range for the predicted bucket */
	if (final < 2) {
		/* lowest two buckets are combined */
		dmin = 0;
		final = 1;
	} else {
		dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
	}
	dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);

	/*
	 * search through runtime history and return first runtime that falls
	 * into the range of predicted bucket.
	 */
	for (i = 0; i < sched_ravg_hist_size; i++) {
		if (hist[i] >= dmin && hist[i] < dmax) {
			ret = hist[i];
			break;
		}
	}
	/* no historical runtime within bucket found, use midpoint of the bin */
	if (ret < dmin)
		ret = (dmin + dmax) / 2;
	/*
	 * when updating in middle of a window, runtime could be higher
	 * than all recorded history. Always predict at least runtime.
	 */
	ret = max(runtime, ret);
out:
	/* pct below: cur_freq_runtime as a percentage of the window size */
	trace_sched_update_pred_demand(rq, p, runtime,
		mult_frac((unsigned int)cur_freq_runtime, 100,
			  sched_ravg_window), ret);
	return ret;
}

/*
 * calc_pred_demand - return the task's predicted demand, refreshed if the
 * busy time of the current window has already exceeded the prediction.
 */
static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
{
	u32 cur = p->ravg.curr_window;

	if (cur > p->ravg.pred_demand)
		return get_pred_busy(rq, p, busy_to_bucket(cur), cur);

	return p->ravg.pred_demand;
}

/*
 * predictive demand of a task is calculated at the window roll-over.
 * if the task current window busy time exceeds the predicted
 * demand, update it here to reflect the task needs.
 */
void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
{
	u32 new, old;

	/* idle and exiting tasks carry no meaningful demand history */
	if (is_idle_task(p) || exiting_task(p))
		return;

	/*
	 * Only act on window roll-over events (PUT_PREV_TASK/TASK_UPDATE),
	 * or additionally on TASK_MIGRATE/PICK_NEXT_TASK when wait time is
	 * accounted as busy time for frequency purposes.
	 */
	if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
			(!sched_freq_account_wait_time ||
			 (event != TASK_MIGRATE &&
			 event != PICK_NEXT_TASK)))
		return;

	new = calc_pred_demand(rq, p);
	old = p->ravg.pred_demand;

	/* prediction only grows here; it is reset at window roll-over */
	if (old >= new)
		return;

	/*
	 * Skip throttled deadline tasks: they are dequeued without the
	 * runqueue stats being told.
	 * NOTE(review): fixup_hmp_sched_stats appears to derive its delta
	 * from the still-old p->ravg.pred_demand, so it must be called
	 * before the assignment below — confirm against PRED_DEMAND_DELTA.
	 */
	if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
				!p->dl.dl_throttled))
		p->sched_class->fixup_hmp_sched_stats(rq, p,
				p->ravg.demand,
				new);

	p->ravg.pred_demand = new;
}

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
@@ -2239,13 +2488,40 @@ fail:
	spin_unlock_irqrestore(&freq_max_load_lock, flags);
	return ret;
}

/*
 * predict_and_update_buckets - predict demand for @runtime and fold
 * @runtime into @p's busy-time histogram.
 *
 * Returns the predicted demand based on the bucket @runtime falls in,
 * then increments that bucket (decaying all the others).
 */
static inline u32 predict_and_update_buckets(struct rq *rq,
			struct task_struct *p, u32 runtime)
{
	int bidx;
	u32 pred_demand;

	bidx = busy_to_bucket(runtime);
	pred_demand = get_pred_busy(rq, p, bidx, runtime);
	bucket_increase(p->ravg.busy_buckets, bidx);

	return pred_demand;
}
/* NOTE: relies on a task_struct pointer named 'p' in the caller's scope */
#define assign_ravg_pred_demand(x) (p->ravg.pred_demand = x)

#else	/* CONFIG_SCHED_FREQ_INPUT */

/* No-op stub: prediction support compiles out without CONFIG_SCHED_FREQ_INPUT */
static inline void
update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
{
}

/* No-op stub: per-CPU busy-time accounting is compiled out */
static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
	     int event, u64 wallclock, u64 irqtime)
{
}

/* Stub: no prediction available, report zero predicted demand */
static inline u32 predict_and_update_buckets(struct rq *rq,
			struct task_struct *p, u32 runtime)
{
	return 0;
}
/* No-op counterpart of the CONFIG_SCHED_FREQ_INPUT assignment macro */
#define assign_ravg_pred_demand(x)

#endif	/* CONFIG_SCHED_FREQ_INPUT */

static int account_busy_for_task_demand(struct task_struct *p, int event)
@@ -2277,7 +2553,7 @@ static void update_history(struct rq *rq, struct task_struct *p,
{
	u32 *hist = &p->ravg.sum_history[0];
	int ridx, widx;
	u32 max = 0, avg, demand;
	u32 max = 0, avg, demand, pred_demand;
	u64 sum = 0;

	/* Ignore windows where task had no activity */
@@ -2314,6 +2590,7 @@ static void update_history(struct rq *rq, struct task_struct *p,
		else
			demand = max(avg, runtime);
	}
	pred_demand = predict_and_update_buckets(rq, p, runtime);

	/*
	 * A throttled deadline sched class task gets dequeued without
@@ -2322,9 +2599,11 @@ static void update_history(struct rq *rq, struct task_struct *p,
	 */
	if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
						!p->dl.dl_throttled))
		p->sched_class->fixup_hmp_sched_stats(rq, p, demand);
		p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
						      pred_demand);

	p->ravg.demand = demand;
	assign_ravg_pred_demand(pred_demand);

done:
	trace_sched_update_history(rq, p, runtime, samples, event);
@@ -2457,7 +2736,7 @@ static void update_task_ravg(struct task_struct *p, struct rq *rq,

	update_task_demand(p, rq, event, wallclock);
	update_cpu_busy_time(p, rq, event, wallclock, irqtime);

	update_task_pred_demand(rq, p, event);
done:
	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime);

@@ -2733,12 +3012,6 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)

#ifdef CONFIG_SCHED_FREQ_INPUT

static inline u64
scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
{
	return div64_u64(load * (u64)src_freq, (u64)dst_freq);
}

void sched_get_cpus_busy(struct sched_load *busy,
			 const struct cpumask *query_cpus)
{
@@ -2746,6 +3019,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
	struct rq *rq;
	const int cpus = cpumask_weight(query_cpus);
	u64 load[cpus], nload[cpus];
	u64 pload[cpus];
	unsigned int cur_freq[cpus], max_freq[cpus];
	int notifier_sent[cpus];
	int early_detection[cpus];
@@ -2773,6 +3047,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
				 sched_ktime_clock(), 0);
		load[i] = rq->old_busy_time = rq->prev_runnable_sum;
		nload[i] = rq->nt_prev_runnable_sum;
		pload[i] = rq->hmp_stats.pred_demands_sum;
		rq->old_estimated_time = pload[i];
		/*
		 * Scale load in reference to cluster max_possible_freq.
		 *
@@ -2781,6 +3057,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
		 */
		load[i] = scale_load_to_cpu(load[i], cpu);
		nload[i] = scale_load_to_cpu(nload[i], cpu);
		pload[i] = scale_load_to_cpu(pload[i], cpu);

		notifier_sent[i] = rq->notifier_sent;
		early_detection[i] = (rq->ed_task != NULL);
@@ -2825,13 +3102,18 @@ void sched_get_cpus_busy(struct sched_load *busy,
			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
						    cpu_max_possible_freq(cpu));
		}
		pload[i] = scale_load_to_freq(pload[i], max_freq[i],
					     rq->cluster->max_possible_freq);

		busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
		busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
		busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);

exit_early:
		trace_sched_get_busy(cpu, busy[i].prev_load,
				     busy[i].new_task_load, early_detection[i]);
				     busy[i].new_task_load,
				     busy[i].predicted_load,
				     early_detection[i]);
		i++;
	}
}
@@ -4190,10 +4472,12 @@ out:

	if (freq_notif_allowed) {
		if (!same_freq_domain(src_cpu, cpu)) {
			check_for_freq_change(cpu_rq(cpu));
			check_for_freq_change(cpu_rq(src_cpu));
			check_for_freq_change(cpu_rq(cpu), false);
			check_for_freq_change(cpu_rq(src_cpu), false);
		} else if (heavy_task) {
			check_for_freq_change(cpu_rq(cpu));
			check_for_freq_change(cpu_rq(cpu), false);
		} else if (success) {
			check_for_freq_change(cpu_rq(cpu), true);
		}
	}

@@ -7383,8 +7667,10 @@ fail:
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);
	if (moved && !same_freq_domain(src_cpu, dest_cpu)) {
		check_for_freq_change(cpu_rq(src_cpu));
		check_for_freq_change(cpu_rq(dest_cpu));
		check_for_freq_change(cpu_rq(src_cpu), false);
		check_for_freq_change(cpu_rq(dest_cpu), false);
	} else if (moved) {
		check_for_freq_change(cpu_rq(dest_cpu), true);
	}
	if (moved && task_notify_on_migrate(p)) {
		struct migration_notify_data mnd;
@@ -9766,7 +10052,9 @@ void __init sched_init(void)
		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
		rq->old_busy_time = 0;
		rq->old_estimated_time = 0;
		rq->notifier_sent = 0;
		rq->hmp_stats.pred_demands_sum = 0;
#endif
#endif
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+4 −2
Original line number Diff line number Diff line
@@ -749,11 +749,13 @@ fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p,
#else
static void
fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p,
			 u32 new_task_load)
			 u32 new_task_load, u32 new_pred_demand)
{
	s64 task_load_delta = (s64)new_task_load - task_load(p);
	s64 pred_demand_delta = PRED_DEMAND_DELTA;

	fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta);
	fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
				      pred_demand_delta);
}
#endif

Loading