Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 86c27869 authored by Adrian Hunter's avatar Adrian Hunter Committed by Arnaldo Carvalho de Melo
Browse files

perf intel-pt: Add support for PERF_RECORD_SWITCH



Add support for selecting and processing PERF_RECORD_SWITCH events for
use by Intel PT.  If they are available, they will be used in preference
to sched_switch events.

This enables an unprivileged user to trace multi-threaded or
multi-process workloads with any level of perf_event_paranoid.  However
it depends on kernel support for PERF_RECORD_SWITCH.

Without this patch, tracing a multi-threaded workload will decode
without error but all the data will be attributed to the main thread.

Without this patch, tracing a multi-process workload will result in
decoder errors because the decoder will not know which executable is
executing.

Signed-off-by: default avatarAdrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1439458857-30636-3-git-send-email-adrian.hunter@intel.com


Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
parent 1b29ac59
Loading
Loading
Loading
Loading
+47 −8
Original line number Diff line number Diff line
@@ -624,6 +624,41 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
	 * threads.
	 */
	if (have_timing_info && !cpu_map__empty(cpus)) {
		if (perf_can_record_switch_events()) {
			bool cpu_wide = !target__none(&opts->target) &&
					!target__has_task(&opts->target);

			if (!cpu_wide && perf_can_record_cpu_wide()) {
				struct perf_evsel *switch_evsel;

				err = parse_events(evlist, "dummy:u", NULL);
				if (err)
					return err;

				switch_evsel = perf_evlist__last(evlist);

				switch_evsel->attr.freq = 0;
				switch_evsel->attr.sample_period = 1;
				switch_evsel->attr.context_switch = 1;

				switch_evsel->system_wide = true;
				switch_evsel->no_aux_samples = true;
				switch_evsel->immediate = true;

				perf_evsel__set_sample_bit(switch_evsel, TID);
				perf_evsel__set_sample_bit(switch_evsel, TIME);
				perf_evsel__set_sample_bit(switch_evsel, CPU);

				opts->record_switch_events = false;
				ptr->have_sched_switch = 3;
			} else {
				opts->record_switch_events = true;
				if (cpu_wide)
					ptr->have_sched_switch = 3;
				else
					ptr->have_sched_switch = 2;
			}
		} else {
			err = intel_pt_track_switches(evlist);
			if (err == -EPERM)
				pr_debug2("Unable to select sched:sched_switch\n");
@@ -632,6 +667,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
			else
				ptr->have_sched_switch = 1;
		}
	}

	if (intel_pt_evsel) {
		/*
@@ -663,8 +699,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
		tracking_evsel->attr.sample_period = 1;

		/* In per-cpu case, always need the time of mmap events etc */
		if (!cpu_map__empty(cpus))
		if (!cpu_map__empty(cpus)) {
			perf_evsel__set_sample_bit(tracking_evsel, TIME);
			/* And the CPU for switch events */
			perf_evsel__set_sample_bit(tracking_evsel, CPU);
		}
	}

	/*
+104 −25
Original line number Diff line number Diff line
@@ -1145,11 +1145,13 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
	return 0;
}

static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
{
	struct machine *machine = pt->machine;
	struct map *map;
	struct symbol *sym, *start;
	u64 ip, switch_ip = 0;
	const char *ptss;

	if (ptss_ip)
		*ptss_ip = 0;
@@ -1177,8 +1179,13 @@ static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
	if (!switch_ip || !ptss_ip)
		return 0;

	if (pt->have_sched_switch == 1)
		ptss = "perf_trace_sched_switch";
	else
		ptss = "__perf_event_task_sched_out";

	for (sym = start; sym; sym = dso__next_symbol(sym)) {
		if (!strcmp(sym->name, "perf_trace_sched_switch")) {
		if (!strcmp(sym->name, ptss)) {
			ip = map->unmap_ip(map, sym->start);
			if (ip >= map->start && ip < map->end) {
				*ptss_ip = ip;
@@ -1198,11 +1205,11 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)

	if (!pt->kernel_start) {
		pt->kernel_start = machine__kernel_start(pt->machine);
		if (pt->per_cpu_mmaps && pt->have_sched_switch &&
		if (pt->per_cpu_mmaps &&
		    (pt->have_sched_switch == 1 || pt->have_sched_switch == 3) &&
		    !pt->timeless_decoding && intel_pt_tracing_kernel(pt) &&
		    !pt->sampling_mode) {
			pt->switch_ip = intel_pt_switch_ip(pt->machine,
							   &pt->ptss_ip);
			pt->switch_ip = intel_pt_switch_ip(pt, &pt->ptss_ip);
			if (pt->switch_ip) {
				intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n",
					     pt->switch_ip, pt->ptss_ip);
@@ -1387,31 +1394,18 @@ static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu)
	return NULL;
}

static int intel_pt_process_switch(struct intel_pt *pt,
				   struct perf_sample *sample)
static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid,
				u64 timestamp)
{
	struct intel_pt_queue *ptq;
	struct perf_evsel *evsel;
	pid_t tid;
	int cpu, err;

	evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
	if (evsel != pt->switch_evsel)
		return 0;

	tid = perf_evsel__intval(evsel, sample, "next_pid");
	cpu = sample->cpu;

	intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
		     cpu, tid, sample->time, perf_time_to_tsc(sample->time,
		     &pt->tc));
	int err;

	if (!pt->sync_switch)
		goto out;
		return 1;

	ptq = intel_pt_cpu_to_ptq(pt, cpu);
	if (!ptq)
		goto out;
		return 1;

	switch (ptq->switch_state) {
	case INTEL_PT_SS_NOT_TRACING:
@@ -1424,7 +1418,7 @@ static int intel_pt_process_switch(struct intel_pt *pt,
		return 0;
	case INTEL_PT_SS_EXPECTING_SWITCH_EVENT:
		if (!ptq->on_heap) {
			ptq->timestamp = perf_time_to_tsc(sample->time,
			ptq->timestamp = perf_time_to_tsc(timestamp,
							  &pt->tc);
			err = auxtrace_heap__add(&pt->heap, ptq->queue_nr,
						 ptq->timestamp);
@@ -1441,10 +1435,76 @@ static int intel_pt_process_switch(struct intel_pt *pt,
	default:
		break;
	}
out:

	return 1;
}

static int intel_pt_process_switch(struct intel_pt *pt,
				   struct perf_sample *sample)
{
	struct perf_evsel *evsel;
	pid_t tid;
	int cpu, ret;

	evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
	if (evsel != pt->switch_evsel)
		return 0;

	tid = perf_evsel__intval(evsel, sample, "next_pid");
	cpu = sample->cpu;

	intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
		     cpu, tid, sample->time, perf_time_to_tsc(sample->time,
		     &pt->tc));

	ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
	if (ret <= 0)
		return ret;

	return machine__set_current_tid(pt->machine, cpu, -1, tid);
}

static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event,
				   struct perf_sample *sample)
{
	bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
	pid_t pid, tid;
	int cpu, ret;

	cpu = sample->cpu;

	if (pt->have_sched_switch == 3) {
		if (!out)
			return 0;
		if (event->header.type != PERF_RECORD_SWITCH_CPU_WIDE) {
			pr_err("Expecting CPU-wide context switch event\n");
			return -EINVAL;
		}
		pid = event->context_switch.next_prev_pid;
		tid = event->context_switch.next_prev_tid;
	} else {
		if (out)
			return 0;
		pid = sample->pid;
		tid = sample->tid;
	}

	if (tid == -1) {
		pr_err("context_switch event has no tid\n");
		return -EINVAL;
	}

	intel_pt_log("context_switch: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
		     cpu, pid, tid, sample->time, perf_time_to_tsc(sample->time,
		     &pt->tc));

	ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
	if (ret <= 0)
		return ret;

	return machine__set_current_tid(pt->machine, cpu, pid, tid);
}

static int intel_pt_process_itrace_start(struct intel_pt *pt,
					 union perf_event *event,
					 struct perf_sample *sample)
@@ -1515,6 +1575,9 @@ static int intel_pt_process_event(struct perf_session *session,
		err = intel_pt_process_switch(pt, sample);
	else if (event->header.type == PERF_RECORD_ITRACE_START)
		err = intel_pt_process_itrace_start(pt, event, sample);
	else if (event->header.type == PERF_RECORD_SWITCH ||
		 event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
		err = intel_pt_context_switch(pt, event, sample);

	intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n",
		     perf_event__name(event->header.type), event->header.type,
@@ -1777,6 +1840,18 @@ static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist)
	return NULL;
}

static bool intel_pt_find_switch(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each(evlist, evsel) {
		if (evsel->attr.context_switch)
			return true;
	}

	return false;
}

static const char * const intel_pt_info_fmts[] = {
	[INTEL_PT_PMU_TYPE]		= "  PMU Type            %"PRId64"\n",
	[INTEL_PT_TIME_SHIFT]		= "  Time Shift          %"PRIu64"\n",
@@ -1888,6 +1963,10 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
			pr_err("%s: missing sched_switch event\n", __func__);
			goto err_delete_thread;
		}
	} else if (pt->have_sched_switch == 2 &&
		   !intel_pt_find_switch(session->evlist)) {
		pr_err("%s: missing context_switch attribute flag\n", __func__);
		goto err_delete_thread;
	}

	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {