Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0bb35550 authored by Srivatsa Vaddagiri's avatar Srivatsa Vaddagiri Committed by Chris Redpath
Browse files

ANDROID: sched: Introduce Window Assisted Load Tracking (WALT)



This patch is a combination of many many patches which have
been previously applied to Android/EAS kernels. Similarly to
other EAS components, we are squashing these to present a more
orderly view of component history and relationships.

The original description of WALT was:

Use a window based view of time in order to track task demand
and CPU utilization in the scheduler.

WALT accounts two major statistics; CPU load and cumulative tasks
demand.

The CPU load which is account of accumulated each CPU's absolute
execution time is for CPU frequency guidance.  Whereas cumulative
tasks demand which is each CPU's instantaneous load to reflect
CPU's load at given time is for task placement decision.

Use cumulative tasks demand for cpu_util() for task placement and
introduce cpu_util_freq() for frequency guidance.

This version includes the "cumulative window demand" statistic
which was originally described as:

Energy cost estimation has been a long lasting challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of
previous window.  Consequently it's not possible to know newly
waking-up task's energy cost until WALT's end of the current window.

The WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum)
and 'Cumulative Runnable Average' (cr_avg).  They are designed for
CPU frequency guidance and task placement but unfortunately both
are not suitable for the energy cost estimation.

It's because using prev_runnable_sum for energy cost calculation would
make us to account CPU and task's energy solely based on activity in the
previous window so for example, any task didn't have an activity in the
previous window will be accounted as a 'zero energy cost' task.
Energy estimation with cr_avg is what energy_diff() relies on at present.
However cr_avg can only represent instantaneous picture of energy cost
thus for example, if a CPU was fully occupied for an entire WALT window
and became idle just before window boundary, and if there is a wake-up,
energy_diff() accounts that CPU is a 'zero energy cost' CPU.

As a result, introduce a new accounting unit 'Cumulative Window Demand'.
The cumulative window demand tracks all the tasks' demands have seen in
current window which is neither instantaneous nor actual execution time.
Because task demand represents estimated scaled execution time when the
task runs a full window, accumulation of all the demands represents
predicted CPU load at the end of window.

Thus we can estimate CPU's frequency at the end of current WALT window
with the cumulative window demand.

This version is extracted wholesale from the version currently
available in android-4.4 and android-4.9.

Window Assisted Load Tracking (WALT) implementation credits:
 Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa,
 Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan,
 Srinath Sridharan, Vikram Mulukutla, Todd Kjos, Juri Lelli,
 John Stultz, Andres Oportus

Change-Id: If92dd9db843374073be59d2cb83febfef993b562
Signed-off-by: default avatarChris Redpath <chris.redpath@arm.com>
parent f240e444
Loading
Loading
Loading
Loading
+54 −0
Original line number Diff line number Diff line
@@ -166,6 +166,15 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN			16

enum task_event {
	PUT_PREV_TASK   = 0,
	PICK_NEXT_TASK  = 1,
	TASK_WAKE       = 2,
	TASK_MIGRATE    = 3,
	TASK_UPDATE     = 4,
	IRQ_UPDATE	= 5,
};

extern cpumask_var_t			cpu_isolated_map;

extern void scheduler_tick(void);
@@ -410,6 +419,41 @@ struct sched_entity {
#endif
};

#ifdef CONFIG_SCHED_WALT
#define RAVG_HIST_SIZE_MAX  5

/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
	/*
	 * 'mark_start' marks the beginning of an event (task waking up, task
	 * starting to execute, task being preempted) within a window
	 *
	 * 'sum' represents how runnable a task has been within current
	 * window. It incorporates both running time and wait time and is
	 * frequency scaled.
	 *
	 * 'sum_history' keeps track of history of 'sum' seen over previous
	 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
	 * ignored.
	 *
	 * 'demand' represents maximum sum seen over previous
	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
	 * demand for tasks.
	 *
	 * 'curr_window' represents task's contribution to cpu busy time
	 * statistics (rq->curr_runnable_sum) in current window
	 *
	 * 'prev_window' represents task's contribution to cpu busy time
	 * statistics (rq->prev_runnable_sum) in previous window
	 */
	u64 mark_start;
	u32 sum, demand;
	u32 sum_history[RAVG_HIST_SIZE_MAX];
	u32 curr_window, prev_window;
	u16 active_windows;
};
#endif

struct sched_rt_entity {
	struct list_head		run_list;
	unsigned long			timeout;
@@ -562,6 +606,16 @@ struct task_struct {
	const struct sched_class	*sched_class;
	struct sched_entity		se;
	struct sched_rt_entity		rt;
#ifdef CONFIG_SCHED_WALT
	struct ravg ravg;
	/*
	 * 'init_load_pct' represents the initial task load assigned to children
	 * of this task
	 */
	u32 init_load_pct;
	u64 last_sleep_ts;
#endif

#ifdef CONFIG_CGROUP_SCHED
	struct task_group		*sched_task_group;
#endif
+6 −0
Original line number Diff line number Diff line
@@ -25,6 +25,12 @@ extern unsigned int sysctl_sched_sync_hint_enable;
extern unsigned int sysctl_sched_cstate_aware;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
#ifdef CONFIG_SCHED_WALT
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int sysctl_sched_use_walt_task_util;
extern unsigned int sysctl_sched_walt_init_task_load_pct;
extern unsigned int sysctl_sched_walt_cpu_high_irqload;
#endif

enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_NONE,
+219 −2
Original line number Diff line number Diff line
@@ -667,6 +667,52 @@ TRACE_EVENT(sched_load_rt_rq,
		  __entry->util)
);

#ifdef CONFIG_SCHED_WALT
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int sysctl_sched_use_walt_task_util;
extern unsigned int walt_ravg_window;
extern bool walt_disabled;
#endif

/*
 * Tracepoint for accounting cpu root cfs_rq
 */
TRACE_EVENT(sched_load_avg_cpu,

        TP_PROTO(int cpu, struct cfs_rq *cfs_rq),

        TP_ARGS(cpu, cfs_rq),

        TP_STRUCT__entry(
                __field( int,   cpu                             )
                __field( unsigned long, load_avg                )
                __field( unsigned long, util_avg                )
                __field( unsigned long, util_avg_pelt           )
                __field( unsigned long, util_avg_walt           )
        ),

        TP_fast_assign(
                __entry->cpu                    = cpu;
                __entry->load_avg               = cfs_rq->avg.load_avg;
                __entry->util_avg               = cfs_rq->avg.util_avg;
                __entry->util_avg_pelt  = cfs_rq->avg.util_avg;
                __entry->util_avg_walt  = 0;
#ifdef CONFIG_SCHED_WALT
                __entry->util_avg_walt  =
                                cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
                do_div(__entry->util_avg_walt, walt_ravg_window);
                if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
                        __entry->util_avg               = __entry->util_avg_walt;
#endif
        ),

        TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
                          "util_avg_pelt=%lu util_avg_walt=%lu",
                  __entry->cpu, __entry->load_avg, __entry->util_avg,
                  __entry->util_avg_pelt, __entry->util_avg_walt)
);


/*
 * Tracepoint for sched_entity load tracking:
 */
@@ -684,6 +730,8 @@ TRACE_EVENT(sched_load_se,
		__field(	pid_t,		pid			      )
		__field(	unsigned long,	load			      )
		__field(	unsigned long,	util			      )
		__field(	unsigned long,	util_pelt		      )
		__field(	unsigned long,	util_walt		      )
	),

	TP_fast_assign(
@@ -698,11 +746,23 @@ TRACE_EVENT(sched_load_se,
		__entry->pid = p ? p->pid : -1;
		__entry->load = se->avg.load_avg;
		__entry->util = se->avg.util_avg;
		__entry->util_pelt  = __entry->util;
		__entry->util_walt  = 0;
#ifdef CONFIG_SCHED_WALT
		if (!se->my_q) {
			struct task_struct *p = container_of(se, struct task_struct, se);
			__entry->util_walt = p->ravg.demand;
			do_div(__entry->util_walt, walt_ravg_window >> SCHED_CAPACITY_SHIFT);
			if (!walt_disabled && sysctl_sched_use_walt_task_util)
				__entry->util = __entry->util_walt;
		}
#endif
	),

	TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu",
	TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu util_pelt=%lu util_walt=%lu",
		  __entry->cpu, __get_str(path), __entry->comm,
		  __entry->pid, __entry->load, __entry->util)
		  __entry->pid, __entry->load, __entry->util,
		  __entry->util_pelt, __entry->util_walt)
);

/*
@@ -921,6 +981,163 @@ TRACE_EVENT(sched_find_best_target,
		__entry->target)
);

#ifdef CONFIG_SCHED_WALT
struct rq;

TRACE_EVENT(walt_update_task_ravg,

	TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
						u64 wallclock, u64 irqtime),

	TP_ARGS(p, rq, evt, wallclock, irqtime),

	TP_STRUCT__entry(
		__array(	char,	comm,   TASK_COMM_LEN	)
		__field(	pid_t,	pid			)
		__field(	pid_t,	cur_pid			)
		__field(	u64,	wallclock		)
		__field(	u64,	mark_start		)
		__field(	u64,	delta_m			)
		__field(	u64,	win_start		)
		__field(	u64,	delta			)
		__field(	u64,	irqtime			)
		__array(    char,   evt, 16			)
		__field(unsigned int,	demand			)
		__field(unsigned int,	sum			)
		__field(	 int,	cpu			)
		__field(	u64,	cs			)
		__field(	u64,	ps			)
		__field(	u32,	curr_window		)
		__field(	u32,	prev_window		)
		__field(	u64,	nt_cs			)
		__field(	u64,	nt_ps			)
		__field(	u32,	active_windows		)
	),

	TP_fast_assign(
			static const char* walt_event_names[] =
			{
				"PUT_PREV_TASK",
				"PICK_NEXT_TASK",
				"TASK_WAKE",
				"TASK_MIGRATE",
				"TASK_UPDATE",
				"IRQ_UPDATE"
			};
		__entry->wallclock      = wallclock;
		__entry->win_start      = rq->window_start;
		__entry->delta          = (wallclock - rq->window_start);
		strcpy(__entry->evt, walt_event_names[evt]);
		__entry->cpu            = rq->cpu;
		__entry->cur_pid        = rq->curr->pid;
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid            = p->pid;
		__entry->mark_start     = p->ravg.mark_start;
		__entry->delta_m        = (wallclock - p->ravg.mark_start);
		__entry->demand         = p->ravg.demand;
		__entry->sum            = p->ravg.sum;
		__entry->irqtime        = irqtime;
		__entry->cs             = rq->curr_runnable_sum;
		__entry->ps             = rq->prev_runnable_sum;
		__entry->curr_window	= p->ravg.curr_window;
		__entry->prev_window	= p->ravg.prev_window;
		__entry->nt_cs		= rq->nt_curr_runnable_sum;
		__entry->nt_ps		= rq->nt_prev_runnable_sum;
		__entry->active_windows	= p->ravg.active_windows;
	),

	TP_printk("wallclock=%llu window_start=%llu delta=%llu event=%s cpu=%d cur_pid=%d pid=%d comm=%s"
		" mark_start=%llu delta=%llu demand=%u sum=%u irqtime=%llu"
		" curr_runnable_sum=%llu prev_runnable_sum=%llu cur_window=%u"
		" prev_window=%u nt_curr_runnable_sum=%llu nt_prev_runnable_sum=%llu active_windows=%u",
		__entry->wallclock, __entry->win_start, __entry->delta,
		__entry->evt, __entry->cpu, __entry->cur_pid,
		__entry->pid, __entry->comm, __entry->mark_start,
		__entry->delta_m, __entry->demand,
		__entry->sum, __entry->irqtime,
		__entry->cs, __entry->ps,
		__entry->curr_window, __entry->prev_window,
		__entry->nt_cs, __entry->nt_ps,
		__entry->active_windows
		)
);

TRACE_EVENT(walt_update_history,

	TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
			int evt),

	TP_ARGS(rq, p, runtime, samples, evt),

	TP_STRUCT__entry(
		__array(	char,	comm,   TASK_COMM_LEN	)
		__field(	pid_t,	pid			)
		__field(unsigned int,	runtime			)
		__field(	 int,	samples			)
		__field(	 int,	evt			)
		__field(	 u64,	demand			)
		__field(unsigned int,	walt_avg		)
		__field(unsigned int,	pelt_avg		)
		__array(	 u32,	hist, RAVG_HIST_SIZE_MAX)
		__field(	 int,	cpu			)
	),

	TP_fast_assign(
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid            = p->pid;
		__entry->runtime        = runtime;
		__entry->samples        = samples;
		__entry->evt            = evt;
		__entry->demand         = p->ravg.demand;
		__entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
		__entry->pelt_avg	= p->se.avg.util_avg;
		memcpy(__entry->hist, p->ravg.sum_history,
					RAVG_HIST_SIZE_MAX * sizeof(u32));
		__entry->cpu            = rq->cpu;
	),

	TP_printk("pid=%d comm=%s runtime=%u samples=%d event=%d demand=%llu ravg_window=%u"
		" walt=%u pelt=%u hist0=%u hist1=%u hist2=%u hist3=%u hist4=%u cpu=%d",
		__entry->pid, __entry->comm,
		__entry->runtime, __entry->samples, __entry->evt,
		__entry->demand,
		walt_ravg_window,
		__entry->walt_avg,
		__entry->pelt_avg,
		__entry->hist[0], __entry->hist[1],
		__entry->hist[2], __entry->hist[3],
		__entry->hist[4], __entry->cpu)
);

TRACE_EVENT(walt_migration_update_sum,

	TP_PROTO(struct rq *rq, struct task_struct *p),

	TP_ARGS(rq, p),

	TP_STRUCT__entry(
		__field(int,		cpu			)
		__field(int,		pid			)
		__field(	u64,	cs			)
		__field(	u64,	ps			)
		__field(	s64,	nt_cs			)
		__field(	s64,	nt_ps			)
	),

	TP_fast_assign(
		__entry->cpu		= cpu_of(rq);
		__entry->cs		= rq->curr_runnable_sum;
		__entry->ps		= rq->prev_runnable_sum;
		__entry->nt_cs		= (s64)rq->nt_curr_runnable_sum;
		__entry->nt_ps		= (s64)rq->nt_prev_runnable_sum;
		__entry->pid		= p->pid;
	),

	TP_printk("cpu=%d curr_runnable_sum=%llu prev_runnable_sum=%llu nt_curr_runnable_sum=%lld nt_prev_runnable_sum=%lld pid=%d",
		  __entry->cpu, __entry->cs, __entry->ps,
		  __entry->nt_cs, __entry->nt_ps, __entry->pid)
);
#endif /* CONFIG_SCHED_WALT */
#endif /* CONFIG_SMP */
#endif /* _TRACE_SCHED_H */

+9 −0
Original line number Diff line number Diff line
@@ -400,6 +400,15 @@ config IRQ_TIME_ACCOUNTING

	  If in doubt, say N here.

config SCHED_WALT
        bool "Support window based load tracking"
        depends on SMP
        help
        This feature will allow the scheduler to maintain a tunable window
	based set of metrics for tasks and runqueues. These metrics can be
	used to guide task placement as well as task frequency requirements
	for cpufreq governors.

config BSD_PROCESS_ACCT
	bool "BSD Process Accounting"
	depends on MULTIUSER
+1 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@ obj-y += idle_task.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += energy.o
obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
Loading