Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 80c804bf authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'bpf-get-stackid'

Alexei Starovoitov says:

====================
bpf_get_stackid() and stack_trace map

This patch set introduces new map type to store stack traces and
corresponding bpf_get_stackid() helper.
BPF programs already can walk the stack via unrolled loop
of bpf_probe_read()s which is ok for simple analysis, but it's
not efficient and limited to <30 frames after that the programs
don't fit into MAX_BPF_STACK. With bpf_get_stackid() helper
the programs can collect up to PERF_MAX_STACK_DEPTH both
user and kernel frames.
Using stack traces as a key in a map turned out to be very useful
for generating flame graphs, off-cpu graphs, waker and chain graphs.
Patch 3 is a simplified version of 'offwaketime' tool which is
described in detail here:
http://brendangregg.com/blog/2016-02-01/linux-wakeup-offwake-profiling.html



Earlier version of this patch were using save_stack_trace() helper,
but 'unreliable' frames add to much noise and two equiavlent
stack traces produce different 'stackid's.
Using lockdep style of storing frames with MAX_STACK_TRACE_ENTRIES is
great for lockdep, but not acceptable for bpf, since the stack_trace
map needs to be freed when user Ctrl-C the tool.
The ftrace style with per_cpu(struct ftrace_stack) is great, but it's
tightly coupled with ftrace ring buffer and has the same 'unreliable'
noise. perf_event's perf_callchain() mechanism is also very efficient
and it only needed minor generalization which is done in patch 1
to be used by bpf stack_trace maps.
Peter, please take a look at patch 1.
If you're ok with it, I'd like to take the whole set via net-next.

Patch 1 - generalization of perf_callchain()
Patch 2 - stack_trace map done as lock-less hashtable without link list
  to avoid spinlock on insertion which is critical path when
  bpf_get_stackid() helper is called for every task switch event
Patch 3 - offwaketime example

After the patch the 'perf report' for artificial 'sched_bench'
benchmark that doing pthread_cond_wait/signal and 'offwaketime'
example is running in the background:
 16.35%  swapper      [kernel.vmlinux]    [k] intel_idle
  2.18%  sched_bench  [kernel.vmlinux]    [k] __switch_to
  2.18%  sched_bench  libpthread-2.12.so  [.] pthread_cond_signal@@GLIBC_2.3.2
  1.72%  sched_bench  libpthread-2.12.so  [.] pthread_mutex_unlock
  1.53%  sched_bench  [kernel.vmlinux]    [k] bpf_get_stackid
  1.44%  sched_bench  [kernel.vmlinux]    [k] entry_SYSCALL_64
  1.39%  sched_bench  [kernel.vmlinux]    [k] __call_rcu.constprop.73
  1.13%  sched_bench  libpthread-2.12.so  [.] pthread_mutex_lock
  1.07%  sched_bench  libpthread-2.12.so  [.] pthread_cond_wait@@GLIBC_2.3.2
  1.07%  sched_bench  [kernel.vmlinux]    [k] hash_futex
  1.05%  sched_bench  [kernel.vmlinux]    [k] do_futex
  1.05%  sched_bench  [kernel.vmlinux]    [k] get_futex_key_refs.isra.13

The hotest part of bpf_get_stackid() is inlined jhash2, so we may consider
using some faster hash in the future, but it's good enough for now.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 6b83d28a a6ffe7b9
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo,
/* Generic stack tracer with callbacks */

struct stacktrace_ops {
	void (*address)(void *data, unsigned long address, int reliable);
	int (*address)(void *data, unsigned long address, int reliable);
	/* On negative return stop dumping */
	int (*stack)(void *data, char *name);
	walk_stack_t	walk_stack;
+2 −2
Original line number Diff line number Diff line
@@ -2180,11 +2180,11 @@ static int backtrace_stack(void *data, char *name)
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
static int backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
	return perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
+4 −2
Original line number Diff line number Diff line
@@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo,
		if (!__kernel_text_address(addr))
			break;

		ops->address(data, addr, 1);
		if (ops->address(data, addr, 1))
			break;
		frame = frame->next_frame;
		ret_addr = &frame->return_address;
		print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
@@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name)
/*
 * Print one address/symbol entries per line.
 */
static void print_trace_address(void *data, unsigned long addr, int reliable)
static int print_trace_address(void *data, unsigned long addr, int reliable)
{
	touch_nmi_watchdog();
	printk_stack_address(addr, reliable, data);
	return 0;
}

static const struct stacktrace_ops print_trace_ops = {
+11 −7
Original line number Diff line number Diff line
@@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name)
	return 0;
}

static void
static int
__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
{
	struct stack_trace *trace = data;
#ifdef CONFIG_FRAME_POINTER
	if (!reliable)
		return;
		return 0;
#endif
	if (nosched && in_sched_functions(addr))
		return;
		return 0;
	if (trace->skip > 0) {
		trace->skip--;
		return;
		return 0;
	}
	if (trace->nr_entries < trace->max_entries)
	if (trace->nr_entries < trace->max_entries) {
		trace->entries[trace->nr_entries++] = addr;
		return 0;
	} else {
		return -1; /* no more room, stop walking the stack */
	}
}

static void save_stack_address(void *data, unsigned long addr, int reliable)
static int save_stack_address(void *data, unsigned long addr, int reliable)
{
	return __save_stack_address(data, addr, reliable, false);
}

static void
static int
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
	return __save_stack_address(data, addr, reliable, true);
+2 −1
Original line number Diff line number Diff line
@@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name)
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
static int backtrace_address(void *data, unsigned long addr, int reliable)
{
	unsigned int *depth = data;

	if ((*depth)--)
		oprofile_add_trace(addr);
	return 0;
}

static struct stacktrace_ops backtrace_ops = {
Loading