Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 49394a2a authored by David Ahern's avatar David Ahern Committed by Arnaldo Carvalho de Melo
Browse files

perf sched timehist: Introduce timehist command



'perf sched timehist' provides an analysis of scheduling events.

Example usage:
    perf sched record -- sleep 1
    perf sched timehist

By default it shows the individual schedule events, including the wait
time (time between sched-out and next sched-in events for the task), the
task scheduling delay (time between wakeup and actually running) and run
time for the task:

            time    cpu  task name             wait time  sch delay   run time
                         [tid/pid]                (msec)     (msec)     (msec)
  -------------- ------  --------------------  ---------  ---------  ---------
    79371.874569 [0011]  gcc[31949]                0.014      0.000      1.148
    79371.874591 [0010]  gcc[31951]                0.000      0.000      0.024
    79371.874603 [0010]  migration/10[59]          3.350      0.004      0.011
    79371.874604 [0011]  <idle>                    1.148      0.000      0.035
    79371.874723 [0005]  <idle>                    0.016      0.000      1.383
    79371.874746 [0005]  gcc[31949]                0.153      0.078      0.022
...

Times are in msec.usec.

Committer note:

Add above explanation as the 'perf sched timehist' entry for 'man
perf-sched'.

Signed-off-by: default avatarDavid Ahern <dsahern@gmail.com>
Signed-off-by: default avatarNamhyung Kim <namhyung@kernel.org>
Acked-by: default avatarIngo Molnar <mingo@kernel.org>
Acked-by: default avatarJiri Olsa <jolsa@kernel.org>
Tested-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/20161116060634.28477-4-namhyung@kernel.org


Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
parent 69b7e480
Loading
Loading
Loading
Loading
+48 −2
Original line number Diff line number Diff line
@@ -8,11 +8,11 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
SYNOPSIS
--------
[verse]
'perf sched' {record|latency|map|replay|script}
'perf sched' {record|latency|map|replay|script|timehist}

DESCRIPTION
-----------
There are five variants of perf sched:
There are several variants of 'perf sched':

  'perf sched record <command>' to record the scheduling events
  of an arbitrary workload.
@@ -36,6 +36,30 @@ There are five variants of perf sched:
  are running on a CPU. A '*' denotes the CPU that had the event, and
  a dot signals an idle CPU.

  'perf sched timehist' provides an analysis of scheduling events.
    
    Example usage:
        perf sched record -- sleep 1
        perf sched timehist
    
   By default it shows the individual schedule events, including the wait
   time (time between sched-out and next sched-in events for the task), the
   task scheduling delay (time between wakeup and actually running) and run
   time for the task:
    
                time    cpu  task name             wait time  sch delay   run time
                             [tid/pid]                (msec)     (msec)     (msec)
      -------------- ------  --------------------  ---------  ---------  ---------
        79371.874569 [0011]  gcc[31949]                0.014      0.000      1.148
        79371.874591 [0010]  gcc[31951]                0.000      0.000      0.024
        79371.874603 [0010]  migration/10[59]          3.350      0.004      0.011
        79371.874604 [0011]  <idle>                    1.148      0.000      0.035
        79371.874723 [0005]  <idle>                    0.016      0.000      1.383
        79371.874746 [0005]  gcc[31949]                0.153      0.078      0.022
    ...
    
   Times are in msec.usec.

OPTIONS
-------
-i::
@@ -66,6 +90,28 @@ OPTIONS for 'perf sched map'
--color-pids::
	Highlight the given pids.

OPTIONS for 'perf sched timehist'
---------------------------------
-k::
--vmlinux=<file>::
    vmlinux pathname

--kallsyms=<file>::
    kallsyms pathname

-s::
--summary::
    Show only a summary of scheduling by thread with min, max, and average
    run times (in sec) and relative stddev.

-S::
--with-summary::
    Show all scheduling events followed by a summary by thread with min,
    max, and average run times (in sec) and relative stddev.

--symfs=<directory>::
    Look for files with symbols relative to this directory.

SEE ALSO
--------
linkperf:perf-record[1]
+589 −5
Original line number Diff line number Diff line
@@ -13,12 +13,14 @@
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>

@@ -194,6 +196,29 @@ struct perf_sched {
	struct perf_sched_map map;
};

/* per thread run time data */
struct thread_runtime {
	u64 last_time;      /* time of previous sched in/out event */
	u64 dt_run;         /* run time */
	u64 dt_wait;        /* time between CPU access (off cpu) */
	u64 dt_delay;       /* time between wakeup and sched-in */
	u64 ready_to_run;   /* time of wakeup */

	struct stats run_stats;
	u64 total_run_time;
};

/* per event run time data */
struct evsel_runtime {
	u64 *last_time; /* time this event was last seen per cpu */
	u32 ncpu;       /* highest cpu slot allocated */
};

/* track idle times per cpu */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";

static u64 get_nsecs(void)
{
	struct timespec ts;
@@ -1654,6 +1679,546 @@ static int perf_sched__read_events(struct perf_sched *sched)
	return rc;
}

/*
 * scheduling times are printed as msec.usec
 */
static inline void print_sched_time(unsigned long long nsecs, int width)
{
	unsigned long msecs;
	unsigned long usecs;

	msecs  = nsecs / NSEC_PER_MSEC;
	nsecs -= msecs * NSEC_PER_MSEC;
	usecs  = nsecs / NSEC_PER_USEC;
	printf("%*lu.%03lu ", width, msecs, usecs);
}

/*
 * returns runtime data for event, allocating memory for it the
 * first time it is used.
 */
static struct evsel_runtime *perf_evsel__get_runtime(struct perf_evsel *evsel)
{
	struct evsel_runtime *r = evsel->priv;

	if (r == NULL) {
		r = zalloc(sizeof(struct evsel_runtime));
		evsel->priv = r;
	}

	return r;
}

/*
 * save last time event was seen per cpu
 */
static void perf_evsel__save_time(struct perf_evsel *evsel,
				  u64 timestamp, u32 cpu)
{
	struct evsel_runtime *r = perf_evsel__get_runtime(evsel);

	if (r == NULL)
		return;

	if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
		int i, n = __roundup_pow_of_two(cpu+1);
		void *p = r->last_time;

		p = realloc(r->last_time, n * sizeof(u64));
		if (!p)
			return;

		r->last_time = p;
		for (i = r->ncpu; i < n; ++i)
			r->last_time[i] = (u64) 0;

		r->ncpu = n;
	}

	r->last_time[cpu] = timestamp;
}

/* returns last time this event was seen on the given cpu */
static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu)
{
	struct evsel_runtime *r = perf_evsel__get_runtime(evsel);

	if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
		return 0;

	return r->last_time[cpu];
}

static int comm_width = 20;

static char *timehist_get_commstr(struct thread *thread)
{
	static char str[32];
	const char *comm = thread__comm_str(thread);
	pid_t tid = thread->tid;
	pid_t pid = thread->pid_;
	int n;

	if (pid == 0)
		n = scnprintf(str, sizeof(str), "%s", comm);

	else if (tid != pid)
		n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);

	else
		n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);

	if (n > comm_width)
		comm_width = n;

	return str;
}

static void timehist_header(void)
{
	printf("%15s %6s ", "time", "cpu");

	printf(" %-20s  %9s  %9s  %9s",
		"task name", "wait time", "sch delay", "run time");

	printf("\n");

	/*
	 * units row
	 */
	printf("%15s %-6s ", "", "");

	printf(" %-20s  %9s  %9s  %9s\n", "[tid/pid]", "(msec)", "(msec)", "(msec)");

	/*
	 * separator
	 */
	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);

	printf(" %.20s  %.9s  %.9s  %.9s",
		graph_dotted_line, graph_dotted_line, graph_dotted_line,
		graph_dotted_line);

	printf("\n");
}

static void timehist_print_sample(struct perf_sample *sample,
				  struct thread *thread)
{
	struct thread_runtime *tr = thread__priv(thread);
	char tstr[64];

	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
	printf("%15s [%04d] ", tstr, sample->cpu);

	printf(" %-*s ", comm_width, timehist_get_commstr(thread));

	print_sched_time(tr->dt_wait, 6);
	print_sched_time(tr->dt_delay, 6);
	print_sched_time(tr->dt_run, 6);
	printf("\n");
}

/*
 * Explanation of delta-time stats:
 *
 *            t = time of current schedule out event
 *        tprev = time of previous sched out event
 *                also time of schedule-in event for current task
 *    last_time = time of last sched change event for current task
 *                (i.e, time process was last scheduled out)
 * ready_to_run = time of wakeup for current task
 *
 * -----|------------|------------|------------|------
 *    last         ready        tprev          t
 *    time         to run
 *
 *      |-------- dt_wait --------|
 *                   |- dt_delay -|-- dt_run --|
 *
 *   dt_run = run time of current task
 *  dt_wait = time between last schedule out event for task and tprev
 *            represents time spent off the cpu
 * dt_delay = time between wakeup and schedule-in of task
 */

static void timehist_update_runtime_stats(struct thread_runtime *r,
					 u64 t, u64 tprev)
{
	r->dt_delay   = 0;
	r->dt_wait    = 0;
	r->dt_run     = 0;
	if (tprev) {
		r->dt_run = t - tprev;
		if (r->ready_to_run) {
			if (r->ready_to_run > tprev)
				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
			else
				r->dt_delay = tprev - r->ready_to_run;
		}

		if (r->last_time > tprev)
			pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
		else if (r->last_time)
			r->dt_wait = tprev - r->last_time;
	}

	update_stats(&r->run_stats, r->dt_run);
	r->total_run_time += r->dt_run;
}

static bool is_idle_sample(struct perf_sample *sample,
			   struct perf_evsel *evsel)
{
	/* pid 0 == swapper == idle task */
	if (sample->pid == 0)
		return true;

	if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0) {
		if (perf_evsel__intval(evsel, sample, "prev_pid") == 0)
			return true;
	}
	return false;
}

/*
 * Track idle stats per cpu by maintaining a local thread
 * struct for the idle task on each cpu.
 */
static int init_idle_threads(int ncpu)
{
	int i;

	idle_threads = zalloc(ncpu * sizeof(struct thread *));
	if (!idle_threads)
		return -ENOMEM;

	idle_max_cpu = ncpu - 1;

	/* allocate the actual thread struct if needed */
	for (i = 0; i < ncpu; ++i) {
		idle_threads[i] = thread__new(0, 0);
		if (idle_threads[i] == NULL)
			return -ENOMEM;

		thread__set_comm(idle_threads[i], idle_comm, 0);
	}

	return 0;
}

static void free_idle_threads(void)
{
	int i;

	if (idle_threads == NULL)
		return;

	for (i = 0; i <= idle_max_cpu; ++i) {
		if ((idle_threads[i]))
			thread__delete(idle_threads[i]);
	}

	free(idle_threads);
}

static struct thread *get_idle_thread(int cpu)
{
	/*
	 * expand/allocate array of pointers to local thread
	 * structs if needed
	 */
	if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
		int i, j = __roundup_pow_of_two(cpu+1);
		void *p;

		p = realloc(idle_threads, j * sizeof(struct thread *));
		if (!p)
			return NULL;

		idle_threads = (struct thread **) p;
		i = idle_max_cpu ? idle_max_cpu + 1 : 0;
		for (; i < j; ++i)
			idle_threads[i] = NULL;

		idle_max_cpu = j;
	}

	/* allocate a new thread struct if needed */
	if (idle_threads[cpu] == NULL) {
		idle_threads[cpu] = thread__new(0, 0);

		if (idle_threads[cpu]) {
			idle_threads[cpu]->tid = 0;
			thread__set_comm(idle_threads[cpu], idle_comm, 0);
		}
	}

	return idle_threads[cpu];
}

/*
 * handle runtime stats saved per thread
 */
static struct thread_runtime *thread__init_runtime(struct thread *thread)
{
	struct thread_runtime *r;

	r = zalloc(sizeof(struct thread_runtime));
	if (!r)
		return NULL;

	init_stats(&r->run_stats);
	thread__set_priv(thread, r);

	return r;
}

static struct thread_runtime *thread__get_runtime(struct thread *thread)
{
	struct thread_runtime *tr;

	tr = thread__priv(thread);
	if (tr == NULL) {
		tr = thread__init_runtime(thread);
		if (tr == NULL)
			pr_debug("Failed to malloc memory for runtime data.\n");
	}

	return tr;
}

static struct thread *timehist_get_thread(struct perf_sample *sample,
					  struct machine *machine,
					  struct perf_evsel *evsel)
{
	struct thread *thread;

	if (is_idle_sample(sample, evsel)) {
		thread = get_idle_thread(sample->cpu);
		if (thread == NULL)
			pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);

	} else {
		thread = machine__findnew_thread(machine, sample->pid, sample->tid);
		if (thread == NULL) {
			pr_debug("Failed to get thread for tid %d. skipping sample.\n",
				 sample->tid);
		}
	}

	return thread;
}

static bool timehist_skip_sample(struct thread *thread)
{
	bool rc = false;

	if (thread__is_filtered(thread))
		rc = true;

	return rc;
}

static int timehist_sched_wakeup_event(struct perf_tool *tool __maybe_unused,
				       union perf_event *event __maybe_unused,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct thread *thread;
	struct thread_runtime *tr = NULL;
	/* want pid of awakened task not pid in sample */
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");

	thread = machine__findnew_thread(machine, 0, pid);
	if (thread == NULL)
		return -1;

	tr = thread__get_runtime(thread);
	if (tr == NULL)
		return -1;

	if (tr->ready_to_run == 0)
		tr->ready_to_run = sample->time;

	return 0;
}

static int timehist_sched_change_event(struct perf_tool *tool __maybe_unused,
				       union perf_event *event,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct addr_location al;
	struct thread *thread;
	struct thread_runtime *tr = NULL;
	u64 tprev;
	int rc = 0;

	if (machine__resolve(machine, &al, sample) < 0) {
		pr_err("problem processing %d event. skipping it\n",
		       event->header.type);
		rc = -1;
		goto out;
	}

	thread = timehist_get_thread(sample, machine, evsel);
	if (thread == NULL) {
		rc = -1;
		goto out;
	}

	if (timehist_skip_sample(thread))
		goto out;

	tr = thread__get_runtime(thread);
	if (tr == NULL) {
		rc = -1;
		goto out;
	}

	tprev = perf_evsel__get_time(evsel, sample->cpu);

	timehist_update_runtime_stats(tr, sample->time, tprev);
	timehist_print_sample(sample, thread);

out:
	if (tr) {
		/* time of this sched_switch event becomes last time task seen */
		tr->last_time = sample->time;

		/* sched out event for task so reset ready to run time */
		tr->ready_to_run = 0;
	}

	perf_evsel__save_time(evsel, sample->time, sample->cpu);

	return rc;
}

static int timehist_sched_switch_event(struct perf_tool *tool,
			     union perf_event *event,
			     struct perf_evsel *evsel,
			     struct perf_sample *sample,
			     struct machine *machine __maybe_unused)
{
	return timehist_sched_change_event(tool, event, evsel, sample, machine);
}

static int process_lost(struct perf_tool *tool __maybe_unused,
			union perf_event *event,
			struct perf_sample *sample,
			struct machine *machine __maybe_unused)
{
	char tstr[64];

	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
	printf("%15s ", tstr);
	printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu);

	return 0;
}


typedef int (*sched_handler)(struct perf_tool *tool,
			  union perf_event *event,
			  struct perf_evsel *evsel,
			  struct perf_sample *sample,
			  struct machine *machine);

static int perf_timehist__process_sample(struct perf_tool *tool,
					 union perf_event *event,
					 struct perf_sample *sample,
					 struct perf_evsel *evsel,
					 struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int err = 0;
	int this_cpu = sample->cpu;

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	if (evsel->handler != NULL) {
		sched_handler f = evsel->handler;

		err = f(tool, event, evsel, sample, machine);
	}

	return err;
}

static int perf_sched__timehist(struct perf_sched *sched)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",       timehist_sched_switch_event, },
		{ "sched:sched_wakeup",	      timehist_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
	};
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
	};

	struct perf_session *session;
	int err = -1;

	/*
	 * event handlers for timehist option
	 */
	sched->tool.sample	 = perf_timehist__process_sample;
	sched->tool.mmap	 = perf_event__process_mmap;
	sched->tool.comm	 = perf_event__process_comm;
	sched->tool.exit	 = perf_event__process_exit;
	sched->tool.fork	 = perf_event__process_fork;
	sched->tool.lost	 = process_lost;
	sched->tool.attr	 = perf_event__process_attr;
	sched->tool.tracing_data = perf_event__process_tracing_data;
	sched->tool.build_id	 = perf_event__process_build_id;

	sched->tool.ordered_events = true;
	sched->tool.ordering_requires_timestamps = true;

	session = perf_session__new(&file, false, &sched->tool);
	if (session == NULL)
		return -ENOMEM;

	symbol__init(&session->header.env);

	setup_pager();

	/* setup per-evsel handlers */
	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out;

	if (!perf_session__has_traces(session, "record -R"))
		goto out;

	/* pre-allocate struct for per-CPU idle stats */
	sched->max_cpu = session->header.env.nr_cpus_online;
	if (sched->max_cpu == 0)
		sched->max_cpu = 4;
	if (init_idle_threads(sched->max_cpu))
		goto out;

	timehist_header();

	err = perf_session__process_events(session);
	if (err) {
		pr_err("Failed to process events, error %d", err);
		goto out;
	}

out:
	free_idle_threads();
	perf_session__delete(session);

	return err;
}


static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
@@ -1970,8 +2535,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
@@ -1983,8 +2546,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
@@ -2001,6 +2562,16 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
                    "display given CPUs in map"),
	OPT_PARENT(sched_options)
	};
	const struct option timehist_options[] = {
	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
		   "file", "kallsyms pathname"),
	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
		    "Look for files with symbols relative to this directory"),
	OPT_PARENT(sched_options)
	};

	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
@@ -2013,8 +2584,13 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
		"perf sched map [<options>]",
		NULL
	};
	const char * const timehist_usage[] = {
		"perf sched timehist [<options>]",
		NULL
	};
	const char *const sched_subcommands[] = { "record", "latency", "map",
						  "replay", "script", NULL };
						  "replay", "script",
						  "timehist", NULL };
	const char *sched_usage[] = {
		NULL,
		NULL
@@ -2077,6 +2653,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else if (!strcmp(argv[0], "timehist")) {
		if (argc) {
			argc = parse_options(argc, argv, timehist_options,
					     timehist_usage, 0);
			if (argc)
				usage_with_options(timehist_usage, timehist_options);
		}
		return perf_sched__timehist(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}