Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3f55b7ed authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'ebpf-tail-call'

Alexei Starovoitov says:

====================
bpf: introduce bpf_tail_call() helper

introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  bpf_tail_call(ctx, &jmp_table, index);
  ...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  if (jmp_table[index])
    return (*jmp_table[index])(ctx);
  ...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.

Use cases:
- simplify complex programs
- dispatch into other programs
  (for example: index in jump table can be syscall number or network protocol)
- build dynamic chains of programs

The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.

patch 1 - support bpf_tail_call() in interpreter
patch 2 - support in x64 JIT
We've discussed what's neccessary to support it in arm64/s390 JITs
and it looks fine.
patch 3 - sample example for tracing
patch 4 - sample example for networking

More details in every patch.

This set went through several iterations of reviews/fixes and older
attempts can be seen:
https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/log/?h=tail_call_v[123456]


- tail_call_v1 does it without touching JITs but introduces overhead
  for all programs that don't use this helper function.
- tail_call_v2 still has some overhead and x64 JIT does full stack
  unwind (prologue skipping optimization wasn't there)
- tail_call_v3 reuses 'call' instruction encoding and has interpreter
  overhead for every normal call
- tail_call_v4 fixes above architectural shortcomings and v5,v6 fix few
  more bugs

This last tail_call_v6 approach seems to be the best.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents e7582bab 530b2c86
Loading
Loading
Loading
Loading
+126 −24
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <asm/cacheflush.h>
#include <linux/bpf.h>

int bpf_jit_enable __read_mostly;

@@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
	return ptr + len;
}

#define EMIT(bytes, len)	do { prog = emit_code(prog, bytes, len); } while (0)
#define EMIT(bytes, len) \
	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)

#define EMIT1(b1)		EMIT(b1, 1)
#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
@@ -186,31 +188,31 @@ struct jit_context {
#define BPF_MAX_INSN_SIZE	128
#define BPF_INSN_SAFETY		64

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
		  int oldproglen, struct jit_context *ctx)
#define STACKSIZE \
	(MAX_BPF_STACK + \
	 32 /* space for rbx, r13, r14, r15 */ + \
	 8 /* space for skb_copy_bits() buffer */)

#define PROLOGUE_SIZE 51

/* emit x64 prologue code for BPF program and check it's size.
 * bpf_tail_call helper will skip it while jumping into another program
 */
static void emit_prologue(u8 **pprog)
{
	struct bpf_insn *insn = bpf_prog->insnsi;
	int insn_cnt = bpf_prog->len;
	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
	bool seen_exit = false;
	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
	int i;
	int proglen = 0;
	u8 *prog = temp;
	int stacksize = MAX_BPF_STACK +
		32 /* space for rbx, r13, r14, r15 */ +
		8 /* space for skb_copy_bits() buffer */;
	u8 *prog = *pprog;
	int cnt = 0;

	EMIT1(0x55); /* push rbp */
	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */

	/* sub rsp, stacksize */
	EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
	/* sub rsp, STACKSIZE */
	EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);

	/* all classic BPF filters use R6(rbx) save it */

	/* mov qword ptr [rbp-X],rbx */
	EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
	EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);

	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
	 */

	/* mov qword ptr [rbp-X],r13 */
	EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
	EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
	/* mov qword ptr [rbp-X],r14 */
	EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
	EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
	/* mov qword ptr [rbp-X],r15 */
	EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
	EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);

	/* clear A and X registers */
	EMIT2(0x31, 0xc0); /* xor eax, eax */
	EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */

	/* clear tail_cnt: mov qword ptr [rbp-X], rax */
	EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);

	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
	*pprog = prog;
}

/* generate the following code:
 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
 *   if (index >= array->map.max_entries)
 *     goto out;
 *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
 *     goto out;
 *   prog = array->prog[index];
 *   if (prog == NULL)
 *     goto out;
 *   goto *(prog->bpf_func + prologue_size);
 * out:
 */
static void emit_bpf_tail_call(u8 **pprog)
{
	u8 *prog = *pprog;
	int label1, label2, label3;
	int cnt = 0;

	/* rdi - pointer to ctx
	 * rsi - pointer to bpf_array
	 * rdx - index in bpf_array
	 */

	/* if (index >= array->map.max_entries)
	 *   goto out;
	 */
	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
	      offsetof(struct bpf_array, map.max_entries));
	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
#define OFFSET1 44 /* number of bytes to jump */
	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
	label1 = cnt;

	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
	 *   goto out;
	 */
	EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
#define OFFSET2 33
	EMIT2(X86_JA, OFFSET2);                   /* ja out */
	label2 = cnt;
	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */

	/* prog = array->prog[index]; */
	EMIT4(0x48, 0x8D, 0x44, 0xD6);            /* lea rax, [rsi + rdx * 8 + 0x50] */
	EMIT1(offsetof(struct bpf_array, prog));
	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */

	/* if (prog == NULL)
	 *   goto out;
	 */
	EMIT4(0x48, 0x83, 0xF8, 0x00);            /* cmp rax, 0 */
#define OFFSET3 10
	EMIT2(X86_JE, OFFSET3);                   /* je out */
	label3 = cnt;

	/* goto *(prog->bpf_func + prologue_size); */
	EMIT4(0x48, 0x8B, 0x40,                   /* mov rax, qword ptr [rax + 32] */
	      offsetof(struct bpf_prog, bpf_func));
	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */

	/* now we're ready to jump into next BPF program
	 * rdi == ctx (1st arg)
	 * rax == prog->bpf_func + prologue_size
	 */
	EMIT2(0xFF, 0xE0);                        /* jmp rax */

	/* out: */
	BUILD_BUG_ON(cnt - label1 != OFFSET1);
	BUILD_BUG_ON(cnt - label2 != OFFSET2);
	BUILD_BUG_ON(cnt - label3 != OFFSET3);
	*pprog = prog;
}

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
		  int oldproglen, struct jit_context *ctx)
{
	struct bpf_insn *insn = bpf_prog->insnsi;
	int insn_cnt = bpf_prog->len;
	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
	bool seen_exit = false;
	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
	int i, cnt = 0;
	int proglen = 0;
	u8 *prog = temp;

	emit_prologue(&prog);

	if (seen_ld_abs) {
		/* r9d : skb->len - skb->data_len (headlen)
		 * r10 : skb->data
@@ -739,6 +837,10 @@ xadd: if (is_imm8(insn->off))
			}
			break;

		case BPF_JMP | BPF_CALL | BPF_X:
			emit_bpf_tail_call(&prog);
			break;

			/* cond jump */
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JNE | BPF_X:
@@ -891,13 +993,13 @@ xadd: if (is_imm8(insn->off))
			/* update cleanup_addr */
			ctx->cleanup_addr = proglen;
			/* mov rbx, qword ptr [rbp-X] */
			EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
			EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
			/* mov r13, qword ptr [rbp-X] */
			EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
			EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
			/* mov r14, qword ptr [rbp-X] */
			EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
			EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
			/* mov r15, qword ptr [rbp-X] */
			EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
			EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);

			EMIT1(0xC9); /* leave */
			EMIT1(0xC3); /* ret */
+22 −0
Original line number Diff line number Diff line
@@ -126,6 +126,27 @@ struct bpf_prog_aux {
	struct work_struct work;
};

struct bpf_array {
	struct bpf_map map;
	u32 elem_size;
	/* 'ownership' of prog_array is claimed by the first program that
	 * is going to use this map or by the first program which FD is stored
	 * in the map to make sure that all callers and callees have the same
	 * prog_type and JITed flag
	 */
	enum bpf_prog_type owner_prog_type;
	bool owner_jited;
	union {
		char value[0] __aligned(8);
		struct bpf_prog *prog[0] __aligned(8);
	};
};
#define MAX_TAIL_CALL_CNT 32

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
void bpf_prog_array_map_clear(struct bpf_map *map);
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);

#ifdef CONFIG_BPF_SYSCALL
void bpf_register_prog_type(struct bpf_prog_type_list *tl);
void bpf_register_map_type(struct bpf_map_type_list *tl);
@@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;

extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;

#endif /* _LINUX_BPF_H */
+1 −1
Original line number Diff line number Diff line
@@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)

int sk_filter(struct sock *sk, struct sk_buff *skb);

void bpf_prog_select_runtime(struct bpf_prog *fp);
int bpf_prog_select_runtime(struct bpf_prog *fp);
void bpf_prog_free(struct bpf_prog *fp);

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
+10 −0
Original line number Diff line number Diff line
@@ -113,6 +113,7 @@ enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,
	BPF_MAP_TYPE_HASH,
	BPF_MAP_TYPE_ARRAY,
	BPF_MAP_TYPE_PROG_ARRAY,
};

enum bpf_prog_type {
@@ -210,6 +211,15 @@ enum bpf_func_id {
	 * Return: 0 on success
	 */
	BPF_FUNC_l4_csum_replace,

	/**
	 * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
	 * @ctx: context pointer passed to next program
	 * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
	 * @index: index inside array that selects specific program to run
	 * Return: 0 on success
	 */
	BPF_FUNC_tail_call,
	__BPF_FUNC_MAX_ID,
};

+107 −6
Original line number Diff line number Diff line
@@ -14,12 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>

struct bpf_array {
	struct bpf_map map;
	u32 elem_size;
	char value[0] __aligned(8);
};
#include <linux/filter.h>

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
	return 0;
}
late_initcall(register_array_map);

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
	/* only bpf_prog file descriptors can be stored in prog_array map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void prog_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->prog[i] != NULL);
	kvfree(array);
}

static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
static int prog_array_map_update_elem(struct bpf_map *map, void *key,
				      void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog, *old_prog;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	prog = bpf_prog_get(ufd);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return -EINVAL;
	}

	old_prog = xchg(array->prog + index, prog);
	if (old_prog)
		bpf_prog_put(old_prog);

	return 0;
}

static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *old_prog;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_prog = xchg(array->prog + index, NULL);
	if (old_prog) {
		bpf_prog_put(old_prog);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_prog_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		prog_array_map_delete_elem(map, &i);
}

static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = prog_array_map_lookup_elem,
	.map_update_elem = prog_array_map_update_elem,
	.map_delete_elem = prog_array_map_delete_elem,
};

static struct bpf_map_type_list prog_array_type __read_mostly = {
	.ops = &prog_array_ops,
	.type = BPF_MAP_TYPE_PROG_ARRAY,
};

static int __init register_prog_array_map(void)
{
	bpf_register_map_type(&prog_array_type);
	return 0;
}
late_initcall(register_prog_array_map);
Loading