Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4b0c53e9 authored by Ingo Molnar's avatar Ingo Molnar
Browse files

Merge tag 'perf-core-for-mingo' of...

Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux

 into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

New features:

  - Introduce PERF_RECORD_SWITCH(_CPU_WIDE) and use it in 'record' to
    ask for context switches, allowing non priviledged tasks to know
    when they are switched in and out, which wasn't possible with
    the other context switch tracepoint and software events, see the
    patch description for a comprehensive justification (Adrian Hunter)

  - Stop collecting /proc/kallsyms in perf.data files, saving about
    4.5MB on a typical x86-64 system, use the the symbol resolution
    routines used in all the other tools (report, top, etc) now that
    we can ask libtraceevent to use perf's symbol resolution code.
    (Arnaldo Carvalho de Melo)

User visible fixes:

  - Expose perf's symbol resolver to libtraceevent, so that its plugins can
    resolve tracepoint fields to kernel functions, like the 'function' field
    in the "timer:hrtimer_start tracepoint" (Arnaldo Carvalho de Melo)

Infrastructure changes:

  - Map propagation of thread and cpu maps improvements, prep work for
    'perf stat' new features (Jiri Olsa)

Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parents a11c51ac 7c14898b
Loading
Loading
Loading
Loading
+30 −1
Original line number Diff line number Diff line
@@ -330,7 +330,8 @@ struct perf_event_attr {
				mmap2          :  1, /* include mmap with inode data     */
				comm_exec      :  1, /* flag comm events that are due to an exec */
				use_clockid    :  1, /* use @clockid for time fields */
				__reserved_1   : 38;
				context_switch :  1, /* context switch data */
				__reserved_1   : 37;

	union {
		__u32		wakeup_events;	  /* wakeup every n events */
@@ -572,9 +573,11 @@ struct perf_event_mmap_page {
/*
 * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
 * different events so can reuse the same bit position.
 * Ditto PERF_RECORD_MISC_SWITCH_OUT.
 */
#define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
#define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
#define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
/*
 * Indicates that the content of PERF_SAMPLE_IP points to
 * the actual instruction that triggered the event. See also
@@ -818,6 +821,32 @@ enum perf_event_type {
	 */
	PERF_RECORD_LOST_SAMPLES		= 13,

	/*
	 * Records a context switch in or out (flagged by
	 * PERF_RECORD_MISC_SWITCH_OUT). See also
	 * PERF_RECORD_SWITCH_CPU_WIDE.
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *	struct sample_id		sample_id;
	 * };
	 */
	PERF_RECORD_SWITCH			= 14,

	/*
	 * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
	 * next_prev_tid that are the next (switching out) or previous
	 * (switching in) pid/tid.
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				next_prev_pid;
	 *	u32				next_prev_tid;
	 *	struct sample_id		sample_id;
	 * };
	 */
	PERF_RECORD_SWITCH_CPU_WIDE		= 15,

	PERF_RECORD_MAX,			/* non-ABI */
};

+103 −0
Original line number Diff line number Diff line
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
	local_irq_restore(flags);
}

static void perf_event_switch(struct task_struct *task,
			      struct task_struct *next_prev, bool sched_in);

#define for_each_task_context_nr(ctxn)					\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

@@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(task, next, false);

	if (atomic_read(&nr_switch_events))
		perf_event_switch(task, next, false);

	for_each_task_context_nr(ctxn)
		perf_event_context_sched_out(task, ctxn, next);

@@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
		perf_cgroup_sched_in(prev, task);

	if (atomic_read(&nr_switch_events))
		perf_event_switch(task, prev, true);

	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(prev, task, true);
}
@@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event)
		atomic_dec(&nr_task_events);
	if (event->attr.freq)
		atomic_dec(&nr_freq_events);
	if (event->attr.context_switch) {
		static_key_slow_dec_deferred(&perf_sched_events);
		atomic_dec(&nr_switch_events);
	}
	if (is_cgroup_event(event))
		static_key_slow_dec_deferred(&perf_sched_events);
	if (has_branch_stack(event))
@@ -5981,6 +5995,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
	perf_output_end(&handle);
}

/*
 * context_switch tracking
 */

struct perf_switch_event {
	struct task_struct	*task;
	struct task_struct	*next_prev;

	struct {
		struct perf_event_header	header;
		u32				next_prev_pid;
		u32				next_prev_tid;
	} event_id;
};

static int perf_event_switch_match(struct perf_event *event)
{
	return event->attr.context_switch;
}

static void perf_event_switch_output(struct perf_event *event, void *data)
{
	struct perf_switch_event *se = data;
	struct perf_output_handle handle;
	struct perf_sample_data sample;
	int ret;

	if (!perf_event_switch_match(event))
		return;

	/* Only CPU-wide events are allowed to see next/prev pid/tid */
	if (event->ctx->task) {
		se->event_id.header.type = PERF_RECORD_SWITCH;
		se->event_id.header.size = sizeof(se->event_id.header);
	} else {
		se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
		se->event_id.header.size = sizeof(se->event_id);
		se->event_id.next_prev_pid =
					perf_event_pid(event, se->next_prev);
		se->event_id.next_prev_tid =
					perf_event_tid(event, se->next_prev);
	}

	perf_event_header__init_id(&se->event_id.header, &sample, event);

	ret = perf_output_begin(&handle, event, se->event_id.header.size);
	if (ret)
		return;

	if (event->ctx->task)
		perf_output_put(&handle, se->event_id.header);
	else
		perf_output_put(&handle, se->event_id);

	perf_event__output_id_sample(event, &handle, &sample);

	perf_output_end(&handle);
}

static void perf_event_switch(struct task_struct *task,
			      struct task_struct *next_prev, bool sched_in)
{
	struct perf_switch_event switch_event;

	/* N.B. caller checks nr_switch_events != 0 */

	switch_event = (struct perf_switch_event){
		.task		= task,
		.next_prev	= next_prev,
		.event_id	= {
			.header = {
				/* .type */
				.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
				/* .size */
			},
			/* .next_prev_pid */
			/* .next_prev_tid */
		},
	};

	perf_event_aux(perf_event_switch_output,
		       &switch_event,
		       NULL);
}

/*
 * IRQ throttle logging
 */
@@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event)
		if (atomic_inc_return(&nr_freq_events) == 1)
			tick_nohz_full_kick_all();
	}
	if (event->attr.context_switch) {
		atomic_inc(&nr_switch_events);
		static_key_slow_inc(&perf_sched_events.key);
	}
	if (has_branch_stack(event))
		static_key_slow_inc(&perf_sched_events.key);
	if (is_cgroup_event(event))
+67 −1
Original line number Diff line number Diff line
@@ -418,7 +418,7 @@ static int func_map_init(struct pevent *pevent)
}

static struct func_map *
find_func(struct pevent *pevent, unsigned long long addr)
__find_func(struct pevent *pevent, unsigned long long addr)
{
	struct func_map *func;
	struct func_map key;
@@ -434,6 +434,71 @@ find_func(struct pevent *pevent, unsigned long long addr)
	return func;
}

struct func_resolver {
	pevent_func_resolver_t *func;
	void		       *priv;
	struct func_map	       map;
};

/**
 * pevent_set_function_resolver - set an alternative function resolver
 * @pevent: handle for the pevent
 * @resolver: function to be used
 * @priv: resolver function private state.
 *
 * Some tools may have already a way to resolve kernel functions, allow them to
 * keep using it instead of duplicating all the entries inside
 * pevent->funclist.
 */
int pevent_set_function_resolver(struct pevent *pevent,
				 pevent_func_resolver_t *func, void *priv)
{
	struct func_resolver *resolver = malloc(sizeof(*resolver));

	if (resolver == NULL)
		return -1;

	resolver->func = func;
	resolver->priv = priv;

	free(pevent->func_resolver);
	pevent->func_resolver = resolver;

	return 0;
}

/**
 * pevent_reset_function_resolver - reset alternative function resolver
 * @pevent: handle for the pevent
 *
 * Stop using whatever alternative resolver was set, use the default
 * one instead.
 */
void pevent_reset_function_resolver(struct pevent *pevent)
{
	free(pevent->func_resolver);
	pevent->func_resolver = NULL;
}

static struct func_map *
find_func(struct pevent *pevent, unsigned long long addr)
{
	struct func_map *map;

	if (!pevent->func_resolver)
		return __find_func(pevent, addr);

	map = &pevent->func_resolver->map;
	map->mod  = NULL;
	map->addr = addr;
	map->func = pevent->func_resolver->func(pevent->func_resolver->priv,
						&map->addr, &map->mod);
	if (map->func == NULL)
		return NULL;

	return map;
}

/**
 * pevent_find_function - find a function by a given address
 * @pevent: handle for the pevent
@@ -6564,6 +6629,7 @@ void pevent_free(struct pevent *pevent)
	free(pevent->trace_clock);
	free(pevent->events);
	free(pevent->sort_events);
	free(pevent->func_resolver);

	free(pevent);
}
+8 −0
Original line number Diff line number Diff line
@@ -453,6 +453,10 @@ struct cmdline_list;
struct func_map;
struct func_list;
struct event_handler;
struct func_resolver;

typedef char *(pevent_func_resolver_t)(void *priv,
				       unsigned long long *addrp, char **modp);

struct pevent {
	int ref_count;
@@ -481,6 +485,7 @@ struct pevent {
	int cmdline_count;

	struct func_map *func_map;
	struct func_resolver *func_resolver;
	struct func_list *funclist;
	unsigned int func_count;

@@ -611,6 +616,9 @@ enum trace_flag_type {
	TRACE_FLAG_SOFTIRQ		= 0x10,
};

int pevent_set_function_resolver(struct pevent *pevent,
				 pevent_func_resolver_t *func, void *priv);
void pevent_reset_function_resolver(struct pevent *pevent);
int pevent_register_comm(struct pevent *pevent, const char *comm, int pid);
int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock);
int pevent_register_function(struct pevent *pevent, char *name,
+4 −0
Original line number Diff line number Diff line
@@ -293,6 +293,10 @@ When processing pre-existing threads /proc/XXX/mmap, it may take a long time,
because the file may be huge. A time out is needed in such cases.
This option sets the time out limit. The default value is 500 ms.

--switch-events::
Record context switch events i.e. events of type PERF_RECORD_SWITCH or
PERF_RECORD_SWITCH_CPU_WIDE.

SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
Loading