Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8ac2c867 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'bpf-per-cpu-maps'

Alexei Starovoitov says:

====================
bpf: introduce per-cpu maps

We've started to use bpf to trace every packet and atomic add
instruction (event JITed) started to show up in perf profile.
The solution is to do per-cpu counters.
For PERCPU_(HASH|ARRAY) map the existing bpf_map_lookup() helper
returns per-cpu area which bpf programs can use to store and
increment the counters. The BPF_MAP_LOOKUP_ELEM syscall command
returns areas from all cpus and user process aggregates the counters.
The usage example is in patch 6. The api turned out to be very
easy to use from bpf program and from user space.
Long term we were discussing to add 'bounded loop' instruction,
so bpf programs can do aggregation within the program which may
help some use cases. Right now user space aggregation of
per-cpu counters fits the best.

This patch set is new approach for per-cpu hash and array maps.
I've reused the map tests written by Martin and Ming, but
implementation and api is new. Old discussion here:
http://thread.gmane.org/gmane.linux.kernel/2123800/focus=2126435


====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents ba905f5e 3059303f
Loading
Loading
Loading
Loading
+24 −0
Original line number Diff line number Diff line
@@ -151,6 +151,7 @@ struct bpf_array {
	union {
		char value[0] __aligned(8);
		void *ptrs[0] __aligned(8);
		void __percpu *pptrs[0] __aligned(8);
	};
};
#define MAX_TAIL_CALL_CNT 32
@@ -182,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
int bpf_obj_get_user(const char __user *pathname);

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 flags);
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 flags);

/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
 * forced to use 'long' read/writes to try to atomically copy long counters.
 * Best-effort only.  No barriers here, since it _will_ race with concurrent
 * updates from BPF programs. Called from bpf syscall and mostly used with
 * size 8 or 16 bytes, so ask compiler to inline it.
 */
static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
{
	const long *lsrc = src;
	long *ldst = dst;

	size /= sizeof(long);
	while (size--)
		*ldst++ = *lsrc++;
}

/* verify correctness of eBPF program */
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
#else
+2 −0
Original line number Diff line number Diff line
@@ -81,6 +81,8 @@ enum bpf_map_type {
	BPF_MAP_TYPE_ARRAY,
	BPF_MAP_TYPE_PROG_ARRAY,
	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	BPF_MAP_TYPE_PERCPU_HASH,
	BPF_MAP_TYPE_PERCPU_ARRAY,
};

enum bpf_prog_type {
+155 −11
Original line number Diff line number Diff line
@@ -17,11 +17,39 @@
#include <linux/filter.h>
#include <linux/perf_event.h>

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		free_percpu(array->pptrs[i]);
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
	}

	return 0;
}

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	struct bpf_array *array;
	u32 elem_size, array_size;
	u64 array_size;
	u32 elem_size;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)

	elem_size = round_up(attr->value_size, 8);

	/* check round_up into zero and u32 overflow */
	if (elem_size == 0 ||
	    attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) attr->max_entries * sizeof(void *);
	else
		array_size += (u64) attr->max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	array_size = sizeof(*array) + attr->max_entries * elem_size;

	/* allocate all map elements and zero-initialize them */
	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
	}

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
		kvfree(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}

@@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * index;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (map_flags > BPF_EXIST)
	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (index >= array->map.max_entries)
	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (map_flags == BPF_NOEXIST)
	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	memcpy(array->value + array->elem_size * index, value, map->value_size);
	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index]),
		       value, map->value_size);
	else
		memcpy(array->value + array->elem_size * index,
		       value, map->value_size);
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

@@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map)
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	kvfree(array);
}

@@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
	.type = BPF_MAP_TYPE_ARRAY,
};

static const struct bpf_map_ops percpu_array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list percpu_array_type __read_mostly = {
	.ops = &percpu_array_ops,
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};

static int __init register_array_map(void)
{
	bpf_register_map_type(&array_type);
	bpf_register_map_type(&percpu_array_type);
	return 0;
}
late_initcall(register_array_map);
+293 −47
Original line number Diff line number Diff line
@@ -31,21 +31,27 @@ struct bpf_htab {
struct htab_elem {
	struct hlist_node hash_node;
	struct rcu_head rcu;
	union {
		u32 hash;
		u32 key_size;
	};
	char key[0] __aligned(8);
};

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
@@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8) +
			  htab->map.value_size;
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += htab->map.value_size;

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	if ((u64) htab->n_buckets * sizeof(struct bucket) +
	    (u64) htab->elem_size * htab->map.max_entries >=
	    U32_MAX - PAGE_SIZE)
	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
				   htab->elem_size * htab->map.max_entries,
				   PAGE_SIZE) >> PAGE_SHIFT;
	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
@@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
}

/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
@@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

@@ -230,65 +253,149 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
	return -ENOENT;
}


static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void htab_percpu_elem_free(struct htab_elem *l)
{
	free_percpu(htab_elem_get_ptr(l, l->key_size));
	kfree(l);
}

static void htab_percpu_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);

	htab_percpu_elem_free(l);
}

static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
{
	if (percpu) {
		l->key_size = key_size;
		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
	} else {
		kfree_rcu(l, rcu);
	}
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus)
{
	u32 size = htab->map.value_size;
	struct htab_elem *l_new;
	void __percpu *pptr;

	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return NULL;

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		/* alloc_percpu zero-fills */
		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
		if (!pptr) {
			kfree(l_new);
			return NULL;
		}

		if (!onallcpus) {
			/* copy true value_size bytes */
			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
		} else {
			int off = 0, cpu;

			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
						value + off, size);
				off += size;
			}
		}
		htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		return -E2BIG;

	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	struct bucket *b;
	unsigned long flags;
	u32 key_size;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (map_flags > BPF_EXIST)
	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* allocate new element outside of lock */
	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return -ENOMEM;

	key_size = map->key_size;

	memcpy(l_new->key, key, key_size);
	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
	hash = htab_map_hash(key, key_size);

	/* allocate new element outside of the lock, since
	 * we're most likley going to insert it
	 */
	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
	if (!l_new)
		return -ENOMEM;

	l_new->hash = htab_map_hash(l_new->key, key_size);
	b = __select_bucket(htab, l_new->hash);
	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
	l_old = lookup_elem_raw(head, hash, key, key_size);

	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		ret = -E2BIG;
	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;
	}

	if (l_old && map_flags == BPF_NOEXIST) {
		/* elem already exists */
		ret = -EEXIST;
		goto err;
	}

	if (!l_old && map_flags == BPF_EXIST) {
		/* elem doesn't exist, cannot update it */
		ret = -ENOENT;
		goto err;
	}

	/* add new element to the head of the list, so that concurrent
	 * search will find it before old elem
	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
@@ -298,7 +405,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
		atomic_inc(&htab->count);
	}
	raw_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
@@ -306,10 +412,84 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
		u32 size = htab->map.value_size;

		/* per-cpu hash map can update value in-place */
		if (!onallcpus) {
			memcpy(this_cpu_ptr(pptr), value, size);
		} else {
			int off = 0, cpu;

			size = round_up(size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
						value + off, size);
				off += size;
			}
		}
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus);
		if (!l_new) {
			ret = -ENOMEM;
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
		atomic_inc(&htab->count);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
@@ -332,7 +512,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
	if (l) {
		hlist_del_rcu(&l->hash_node);
		atomic_dec(&htab->count);
		kfree_rcu(l, rcu);
		free_htab_elem(l, percpu, key_size);
		ret = 0;
	}

@@ -352,10 +532,15 @@ static void delete_all_elements(struct bpf_htab *htab)
		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			atomic_dec(&htab->count);
			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
				l->key_size = htab->map.key_size;
				htab_percpu_elem_free(l);
			} else {
				kfree(l);
			}
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
@@ -391,9 +576,70 @@ static struct bpf_map_type_list htab_type __read_mostly = {
	.type = BPF_MAP_TYPE_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, true);
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __read_mostly = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	return 0;
}
late_initcall(register_htab_map);
+40 −17
Original line number Diff line number Diff line
@@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr)
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	u32 value_size;
	struct fd f;
	int err;

@@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr)
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		value_size = round_up(map->value_size, 8) * num_possible_cpus();
	else
		value_size = map->value_size;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
		err = bpf_percpu_hash_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_copy(map, key, value);
	} else {
		rcu_read_lock();
		ptr = map->ops->map_lookup_elem(map, key);
		if (ptr)
		memcpy(value, ptr, map->value_size);
			memcpy(value, ptr, value_size);
		rcu_read_unlock();
		err = ptr ? 0 : -ENOENT;
	}

	err = -ENOENT;
	if (!ptr)
	if (err)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
	if (copy_to_user(uvalue, value, value_size) != 0)
		goto free_value;

	err = 0;
@@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr)
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	struct fd f;
	int err;

@@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr)
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		value_size = round_up(map->value_size, 8) * num_possible_cpus();
	else
		value_size = map->value_size;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
	if (copy_from_user(value, uvalue, value_size) != 0)
		goto free_value;

	/* eBPF program that use maps are running under rcu_read_lock(),
	 * therefore all map accessors rely on this fact, so do the same here
	 */
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
		err = bpf_percpu_hash_update(map, key, value, attr->flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_update(map, key, value, attr->flags);
	} else {
		rcu_read_lock();
		err = map->ops->map_update_elem(map, key, value, attr->flags);
		rcu_read_unlock();
	}

free_value:
	kfree(value);
Loading