Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c195651e authored by Yonghong Song's avatar Yonghong Song Committed by Alexei Starovoitov
Browse files

bpf: add bpf_get_stack helper



Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Acked-by: default avatarAlexei Starovoitov <ast@fb.com>
Signed-off-by: default avatarYonghong Song <yhs@fb.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parent 5f412632
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;

/* Shared helpers among cBPF and eBPF. */
+2 −1
Original line number Diff line number Diff line
@@ -468,7 +468,8 @@ struct bpf_prog {
				dst_needed:1,	/* Do we need dst entry? */
				blinded:1,	/* Was blinded */
				is_func:1,	/* program is a bpf function */
				kprobe_override:1; /* Do we override a kprobe? */
				kprobe_override:1, /* Do we override a kprobe? */
				has_callchain_buf:1; /* callchain buffer allocated? */
	enum bpf_prog_type	type;		/* Type of BPF program */
	enum bpf_attach_type	expected_attach_type; /* For some prog types */
	u32			len;		/* Number of filter blocks */
+40 −2
Original line number Diff line number Diff line
@@ -1767,6 +1767,40 @@ union bpf_attr {
 * 		**CONFIG_XFRM** configuration option.
 * 	Return
 * 		0 on success, or a negative error in case of failure.
 *
 * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
 * 	Description
 *		Return a user or a kernel stack in bpf program provided buffer.
 *		To achieve this, the helper needs *ctx*, which is a pointer
 *		to the context on which the tracing program is executed.
 *		To store the stacktrace, the bpf program provides *buf* with
 *		a nonnegative *size*.
 *
 *		The last argument, *flags*, holds the number of stack frames to
 *		skip (from 0 to 255), masked with
 *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
 *		the following flags:
 *
 *		**BPF_F_USER_STACK**
 *			Collect a user space stack instead of a kernel stack.
 *		**BPF_F_USER_BUILD_ID**
 *			Collect buildid+offset instead of ips for user stack,
 *			only valid if **BPF_F_USER_STACK** is also specified.
 *
 *		**bpf_get_stack**\ () can collect up to
 *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
 *		to sufficient large buffer size. Note that
 *		this limit can be controlled with the **sysctl** program, and
 *		that it should be manually increased in order to profile long
 *		user stacks (such as stacks for Java programs). To do so, use:
 *
 *	::
 *
 *		# sysctl kernel.perf_event_max_stack=<new value>
 *
 * 	Return
 * 		a non-negative value equal to or less than size on success, or
 * 		a negative error in case of failure.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -1835,7 +1869,8 @@ union bpf_attr {
	FN(msg_pull_data),		\
	FN(bind),			\
	FN(xdp_adjust_tail),		\
	FN(skb_get_xfrm_state),
	FN(skb_get_xfrm_state),		\
	FN(get_stack),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -1869,11 +1904,14 @@ enum bpf_func_id {
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
#define BPF_F_TUNINFO_IPV6		(1ULL << 0)

/* BPF_FUNC_get_stackid flags. */
/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
#define BPF_F_SKIP_FIELD_MASK		0xffULL
#define BPF_F_USER_STACK		(1ULL << 8)
/* flags used by BPF_FUNC_get_stackid only. */
#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
#define BPF_F_REUSE_STACKID		(1ULL << 10)
/* flags used by BPF_FUNC_get_stack only. */
#define BPF_F_USER_BUILD_ID		(1ULL << 11)

/* BPF_FUNC_skb_set_tunnel_key flags. */
#define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+5 −0
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>

#include <asm/unaligned.h>

@@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
	aux = container_of(work, struct bpf_prog_aux, work);
	if (bpf_prog_is_dev_bound(aux))
		bpf_prog_offload_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
	if (aux->prog->has_callchain_buf)
		put_callchain_buffers();
#endif
	for (i = 0; i < aux->func_cnt; i++)
		bpf_jit_free(aux->func[i]);
	if (aux->func_cnt) {
+67 −0
Original line number Diff line number Diff line
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	if (kernel && user_build_id)
		goto clear;

	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
					    : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	num_elem = size / elem_size;
	if (sysctl_perf_event_max_stack < num_elem)
		init_nr = 0;
	else
		init_nr = sysctl_perf_event_max_stack - num_elem;
	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);
	if (unlikely(!trace))
		goto err_fault;

	trace_nr = trace->nr - init_nr;
	if (trace_nr < skip)
		goto err_fault;

	trace_nr -= skip;
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
	copy_len = trace_nr * elem_size;
	ips = trace->ip + skip + init_nr;
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);

	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);
	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}

const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
Loading