Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c1bf5fe0 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'bpf-unprivileged'

Alexei Starovoitov says:

====================
bpf: unprivileged

v1-v2:
- this set logically depends on cb patch
  "bpf: fix cb access in socket filter programs":
  http://patchwork.ozlabs.org/patch/527391/


  which is must have to allow unprivileged programs.
  Thanks Daniel for finding that issue.
- refactored sysctl to be similar to 'modules_disabled'
- dropped bpf_trace_printk
- split tests into separate patch and added more tests
  based on discussion

v1 cover letter:
I think it is time to liberate eBPF from CAP_SYS_ADMIN.
As was discussed when eBPF was first introduced two years ago
the only piece missing in eBPF verifier is 'pointer leak detection'
to make it available to non-root users.
Patch 1 adds this pointer analysis.
The eBPF programs, obviously, need to see and operate on kernel addresses,
but with these extra checks they won't be able to pass these addresses
to user space.
Patch 2 adds accounting of kernel memory used by programs and maps.
It changes behavoir for existing root users, but I think it needs
to be done consistently for both root and non-root, since today
programs and maps are only limited by number of open FDs (RLIMIT_NOFILE).
Patch 2 accounts program's and map's kernel memory as RLIMIT_MEMLOCK.

Unprivileged eBPF is only meaningful for 'socket filter'-like programs.
eBPF programs for tracing and TC classifiers/actions will stay root only.

In parallel the bpf fuzzing effort is ongoing and so far
we've found only one verifier bug and that was already fixed.
The 'constant blinding' pass also being worked on.
It will obfuscate constant-like values that are part of eBPF ISA
to make jit spraying attacks even harder.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 0fa28877 bf508877
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -36,6 +36,8 @@ struct bpf_map {
	u32 key_size;
	u32 value_size;
	u32 max_entries;
	u32 pages;
	struct user_struct *user;
	const struct bpf_map_ops *ops;
	struct work_struct work;
};
@@ -128,6 +130,7 @@ struct bpf_prog_aux {
	const struct bpf_verifier_ops *ops;
	struct bpf_map **used_maps;
	struct bpf_prog *prog;
	struct user_struct *user;
	union {
		struct work_struct work;
		struct rcu_head	rcu;
@@ -167,6 +170,8 @@ void bpf_prog_put_rcu(struct bpf_prog *prog);
struct bpf_map *bpf_map_get(struct fd f);
void bpf_map_put(struct bpf_map *map);

extern int sysctl_unprivileged_bpf_disabled;

/* verify correctness of eBPF program */
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
#else
+1 −1
Original line number Diff line number Diff line
@@ -840,7 +840,7 @@ struct user_struct {
	struct hlist_node uidhash_node;
	kuid_t uid;

#ifdef CONFIG_PERF_EVENTS
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
	atomic_long_t locked_vm;
#endif
};
+1 −1
Original line number Diff line number Diff line
@@ -49,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;

	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
	array->elem_size = elem_size;

	return &array->map;
+4 −0
Original line number Diff line number Diff line
@@ -88,6 +88,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8) +
			  htab->map.value_size;

	htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
				   htab->elem_size * htab->map.max_entries,
				   PAGE_SIZE) >> PAGE_SHIFT;
	return &htab->map;

free_htab:
+69 −5
Original line number Diff line number Diff line
@@ -18,6 +18,8 @@
#include <linux/filter.h>
#include <linux/version.h>

int sysctl_unprivileged_bpf_disabled __read_mostly;

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
	list_add(&tl->list_node, &bpf_map_types);
}

static int bpf_map_charge_memlock(struct bpf_map *map)
{
	struct user_struct *user = get_current_user();
	unsigned long memlock_limit;

	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	atomic_long_add(map->pages, &user->locked_vm);

	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
		atomic_long_sub(map->pages, &user->locked_vm);
		free_uid(user);
		return -EPERM;
	}
	map->user = user;
	return 0;
}

static void bpf_map_uncharge_memlock(struct bpf_map *map)
{
	struct user_struct *user = map->user;

	atomic_long_sub(map->pages, &user->locked_vm);
	free_uid(user);
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	bpf_map_uncharge_memlock(map);
	/* implementation dependent freeing */
	map->ops->map_free(map);
}
@@ -108,6 +137,10 @@ static int map_create(union bpf_attr *attr)

	atomic_set(&map->refcnt, 1);

	err = bpf_map_charge_memlock(map);
	if (err)
		goto free_map;

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);

	if (err < 0)
@@ -440,11 +473,37 @@ static void free_used_maps(struct bpf_prog_aux *aux)
	kfree(aux->used_maps);
}

static int bpf_prog_charge_memlock(struct bpf_prog *prog)
{
	struct user_struct *user = get_current_user();
	unsigned long memlock_limit;

	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	atomic_long_add(prog->pages, &user->locked_vm);
	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
		atomic_long_sub(prog->pages, &user->locked_vm);
		free_uid(user);
		return -EPERM;
	}
	prog->aux->user = user;
	return 0;
}

static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
	struct user_struct *user = prog->aux->user;

	atomic_long_sub(prog->pages, &user->locked_vm);
	free_uid(user);
}

static void __prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_uncharge_memlock(aux->prog);
	bpf_prog_free(aux->prog);
}

@@ -544,11 +603,18 @@ static int bpf_prog_load(union bpf_attr *attr)
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	err = bpf_prog_charge_memlock(prog);
	if (err)
		goto free_prog_nouncharge;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
@@ -590,6 +656,8 @@ static int bpf_prog_load(union bpf_attr *attr)
free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_uncharge_memlock(prog);
free_prog_nouncharge:
	bpf_prog_free(prog);
	return err;
}
@@ -599,11 +667,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
	if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
Loading