Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d74a790d authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'bpf-perf'

Kaixu Xia says:

====================
bpf: Introduce the new ability of eBPF programs to access hardware PMU counter

This patchset is base on the net-next:
 git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
commit 9dc20a64.

Previous patch v6 url:
https://lkml.org/lkml/2015/8/4/188



changes in V7:
 - rebase the whole patch set to net-next tree(9dc20a64);
 - split out the core perf APIs into Patch 1/5;
 - change the return value of function perf_event_attrs()
   from struct perf_event * to const struct perf_event * in
   Patch 1/5;
 - rename the function perf_event_read_internal() to perf_event_
   read_local() and rewrite it in Patch 1/5;
 - rename the function check_func_limit() to check_map_func
   _compatibility() and remove the unnecessary pass pointer to
   a pointer in Patch 4/5;

changes in V6:
 - make the Patch 1/4 commit message more meaning and readable;
 - remove the unnecessary comment in Patch 2/4 and make it clean;
 - declare the function perf_event_release_kernel() in include/
   linux/perf_event.h to fix the build error when CONFIG_PERF_EVENTS
   isn't configured in Patch 2/4;
 - add function perf_event_attrs() to get the struct perf_event_attr
   in Patch 2/4.
 - move the related code from kernel/trace/bpf_trace.c to kernel/
   events/core.c and add function perf_event_read_internal() to
   avoid poking inside of the event outside of perf code in Patch 3/4;
 - generial the func & map match-pair with an array in Patch 3/4;

changes in V5:
 - move struct fd_array_map_ops* fd_ops to bpf_map;
 - move array perf event decrement refcnt function to
   map_free;
 - fix the NULL ptr of perf_event_get();
 - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c;
 - get rid of the remaining struct bpf_prog;
 - move the unnecessay cast on void *;

changes in V4:
 - make the bpf_prog_array_map more generic;
 - fix the bug of event refcnt leak;
 - use more useful errno in bpf_perf_event_read();

changes in V3:
 - collapse V2 patches 1-3 into one;
 - drop the function map->ops->map_traverse_elem() and release
   the struct perf_event in map_free;
 - only allow to access bpf_perf_event_read() from programs;
 - update the perf_event_array_map elem via xchg();
 - pass index directly to bpf_perf_event_read() instead of
   MAP_KEY;

changes in V2:
 - put atomic_long_inc_not_zero() between fdget() and fdput();
 - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
 - Only read the event counter on current CPU or on current
   process;
 - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
   pointer to the struct perf_event;
 - according to the perf_event_map_fd and key, the function
   bpf_perf_event_read() can get the Hardware PMU counter value;

Patch 5/5 is a simple example and shows how to use this new eBPF
programs ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU
value when 'kprobe/sys_write' sampling)

  $ cat /sys/kernel/debug/tracing/trace_pipe
  $ ./tracex6
       ...
       syslog-ng-548   [000] d..1    76.905673: : CPU-0   681765271
       syslog-ng-548   [000] d..1    76.905690: : CPU-0   681787855
       syslog-ng-548   [000] d..1    76.905707: : CPU-0   681810504
       syslog-ng-548   [000] d..1    76.905725: : CPU-0   681834771
       syslog-ng-548   [000] d..1    76.905745: : CPU-0   681859519
       syslog-ng-548   [000] d..1    76.905766: : CPU-0   681890419
       syslog-ng-548   [000] d..1    76.905783: : CPU-0   681914045
       syslog-ng-548   [000] d..1    76.905800: : CPU-0   681935950
       syslog-ng-548   [000] d..1    76.905816: : CPU-0   681958299
              ls-690   [005] d..1    82.241308: : CPU-5   3138451
              sh-691   [004] d..1    82.244570: : CPU-4   7324988
           <...>-699   [007] d..1    99.961387: : CPU-7   3194027
           <...>-695   [003] d..1    99.961474: : CPU-3   288901
           <...>-695   [003] d..1    99.961541: : CPU-3   383145
           <...>-695   [003] d..1    99.961591: : CPU-3   450365
           <...>-695   [003] d..1    99.961639: : CPU-3   515751
           <...>-695   [003] d..1    99.961686: : CPU-3   579047
       ...

The detail of patches is as follow:

Patch 1/5 add the necessary core perf APIs perf_event_attrs(),
perf_event_get(),perf_event_read_local() when accessing events
counters in eBPF programs

Patch 2/5 rewrites part of the bpf_prog_array map code and make it
more generic;

Patch 3/5 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;

Patch 4/5 implements function bpf_perf_event_read() that get the
selected hardware PMU conuter;

Patch 5/5 gives a simple example.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents f1d5ca43 47efb302
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
 *     goto out;
 *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
 *     goto out;
 *   prog = array->prog[index];
 *   prog = array->ptrs[index];
 *   if (prog == NULL)
 *     goto out;
 *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */

	/* prog = array->prog[index]; */
	/* prog = array->ptrs[index]; */
	EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,       /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
		    offsetof(struct bpf_array, prog));
		    offsetof(struct bpf_array, ptrs));
	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */

	/* if (prog == NULL)
+8 −2
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/perf_event.h>

struct bpf_map;

@@ -24,6 +25,10 @@ struct bpf_map_ops {
	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
	int (*map_delete_elem)(struct bpf_map *map, void *key);

	/* funcs called by prog_array and perf_event_array map */
	void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
	void (*map_fd_put_ptr) (void *ptr);
};

struct bpf_map {
@@ -142,13 +147,13 @@ struct bpf_array {
	bool owner_jited;
	union {
		char value[0] __aligned(8);
		struct bpf_prog *prog[0] __aligned(8);
		void *ptrs[0] __aligned(8);
	};
};
#define MAX_TAIL_CALL_CNT 32

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
void bpf_prog_array_map_clear(struct bpf_map *map);
void bpf_fd_array_map_clear(struct bpf_map *map);
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);

@@ -185,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;

extern const struct bpf_func_proto bpf_perf_event_read_proto;
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
+10 −0
Original line number Diff line number Diff line
@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct perf_event *perf_event_get(unsigned int fd);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
@@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
				void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
				int src_cpu, int dst_cpu);
extern u64 perf_event_read_local(struct perf_event *event);
extern u64 perf_event_read_value(struct perf_event *event,
				 u64 *enabled, u64 *running);

@@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)	{ }
static inline void perf_event_free_task(struct task_struct *task)	{ }
static inline void perf_event_delayed_put(struct task_struct *task)	{ }
static inline struct perf_event *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	return ERR_PTR(-EINVAL);
}
static inline u64 perf_event_read_local(struct perf_event *event)	{ return -EINVAL; }
static inline void perf_event_print_debug(void)				{ }
static inline int perf_event_task_disable(void)				{ return -EINVAL; }
static inline int perf_event_task_enable(void)				{ return -EINVAL; }
@@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event) { }
static inline void perf_event_disable(struct perf_event *event)		{ }
static inline int __perf_event_disable(void *info)			{ return -1; }
static inline void perf_event_task_tick(void)				{ }
static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
#endif

#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
+2 −0
Original line number Diff line number Diff line
@@ -114,6 +114,7 @@ enum bpf_map_type {
	BPF_MAP_TYPE_HASH,
	BPF_MAP_TYPE_ARRAY,
	BPF_MAP_TYPE_PROG_ARRAY,
	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

enum bpf_prog_type {
@@ -270,6 +271,7 @@ enum bpf_func_id {
	 */
	BPF_FUNC_skb_get_tunnel_key,
	BPF_FUNC_skb_set_tunnel_key,
	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
	__BPF_FUNC_MAX_ID,
};

+106 −31
Original line number Diff line number Diff line
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
}
late_initcall(register_array_map);

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only bpf_prog file descriptors can be stored in prog_array map */
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void prog_array_map_free(struct bpf_map *map)
static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->prog[i] != NULL);
		BUG_ON(array->ptrs[i] != NULL);
	kvfree(array);
}

static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
static int prog_array_map_update_elem(struct bpf_map *map, void *key,
static int fd_array_map_update_elem(struct bpf_map *map, void *key,
				    void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog, *old_prog;
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
		return -E2BIG;

	ufd = *(u32 *)value;
	prog = bpf_prog_get(ufd);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return -EINVAL;
	}
	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_prog = xchg(array->prog + index, prog);
	if (old_prog)
		bpf_prog_put_rcu(old_prog);
	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *old_prog;
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_prog = xchg(array->prog + index, NULL);
	if (old_prog) {
		bpf_prog_put_rcu(old_prog);
	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);
	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}
	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	struct bpf_prog *prog = ptr;

	bpf_prog_put_rcu(prog);
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_prog_array_map_clear(struct bpf_map *map)
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		prog_array_map_delete_elem(map, &i);
		fd_array_map_delete_elem(map, &i);
}

static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = prog_array_map_lookup_elem,
	.map_update_elem = prog_array_map_update_elem,
	.map_delete_elem = prog_array_map_delete_elem,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_update_elem = fd_array_map_update_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
};

static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
	return 0;
}
late_initcall(register_prog_array_map);

static void perf_event_array_map_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
{
	struct perf_event *event;
	const struct perf_event_attr *attr;

	event = perf_event_get(fd);
	if (IS_ERR(event))
		return event;

	attr = perf_event_attrs(event);
	if (IS_ERR(attr))
		return (void *)attr;

	if (attr->type != PERF_TYPE_RAW &&
	    attr->type != PERF_TYPE_HARDWARE) {
		perf_event_release_kernel(event);
		return ERR_PTR(-EINVAL);
	}
	return event;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	struct perf_event *event = ptr;

	perf_event_release_kernel(event);
}

static const struct bpf_map_ops perf_event_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = perf_event_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_update_elem = fd_array_map_update_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
};

static struct bpf_map_type_list perf_event_array_type __read_mostly = {
	.ops = &perf_event_array_ops,
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

static int __init register_perf_event_array_map(void)
{
	bpf_register_map_type(&perf_event_array_type);
	return 0;
}
late_initcall(register_perf_event_array_map);
Loading