
Commit 96eabe7a authored by Martin KaFai Lau, committed by David S. Miller

bpf: Allow selecting numa node during map creation



The current map creation API does not allow providing a numa-node
preference.  The memory usually comes from the node where the
map-creating process is running.  The performance is not ideal if the
bpf_prog is known to always run on a numa node different from the
map-creating process.

One use case is sharding on CPU to different LRU maps (i.e.
an array of LRU maps).  Here is the map_perf_test result for
the INNER_LRU_HASH_PREALLOC test when the LRU map used by
CPU0 is forced to be allocated from a remote numa node:

[ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ]

># taskset -c 10 ./map_perf_test 512 8 1260000 8000000
5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec
4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec
3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec
6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec
2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec
1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec
7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec
0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec  #<<<

After specifying numa node:
># taskset -c 10 ./map_perf_test 512 8 1260000 8000000
5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec
3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec
1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec
6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec
2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec
4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec
7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec
0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<<

This patch adds one field, numa_node, to the bpf_attr.  Since numa node 0
is a valid node, a new flag BPF_F_NUMA_NODE is also added.  The numa_node
field is honored if and only if the BPF_F_NUMA_NODE flag is set.

Numa node selection is not supported for percpu maps.
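
As a usage illustration (not part of this patch), here is a minimal
userspace sketch of a map-create call with the new attribute; the
wrapper name and the key/value sizes are made up:

	/* Hedged sketch: create a plain hash map whose memory should come
	 * from a given numa node.  Requires a <linux/bpf.h> that already
	 * carries BPF_F_NUMA_NODE and the numa_node field.
	 */
	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int create_map_on_node(int node)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_type    = BPF_MAP_TYPE_HASH;
		attr.key_size    = 4;	/* illustrative sizes */
		attr.value_size  = 8;
		attr.max_entries = 1024;
		attr.map_flags   = BPF_F_NUMA_NODE;	/* numa_node honored only with this flag */
		attr.numa_node   = node;	/* e.g. the node of the CPUs running the prog */

		return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	}

Per the checks in the hunks below, setting BPF_F_NUMA_NODE on a percpu
map type is rejected with -EINVAL.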

This patch does not convert every allocation.  F.e.
'htab = kzalloc()' is left unchanged since the object
is small enough to stay in the cache.
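
The matching bpf_map_area_alloc() implementation change is not among
the hunks excerpted below; as a rough sketch only (not the exact kernel
code), a numa-aware area allocator along these lines tries
kmalloc_node() first and falls back to vmalloc_node():

	/* Illustrative only, assumes kernel context: kmalloc_node() for
	 * areas small enough for the page allocator, vmalloc_node() as
	 * the fallback.  Both take the target numa node directly.
	 */
	static void *map_area_alloc_sketch(size_t size, int numa_node)
	{
		void *area;

		if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			area = kmalloc_node(size, GFP_USER | __GFP_NOWARN,
					    numa_node);
			if (area)
				return area;
		}
		return vmalloc_node(size, numa_node);
	}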

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent bd76b879
include/linux/bpf.h (+9 −1)

@@ -51,6 +51,7 @@ struct bpf_map {
 	u32 map_flags;
 	u32 pages;
 	u32 id;
+	int numa_node;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
 	struct work_struct work;
@@ -264,7 +265,7 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
-void *bpf_map_area_alloc(size_t size);
+void *bpf_map_area_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
@@ -316,6 +317,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
+/* Return map's numa specified by userspace */
+static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
+{
+	return (attr->map_flags & BPF_F_NUMA_NODE) ?
+		attr->numa_node : NUMA_NO_NODE;
+}
+
 #else
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
include/uapi/linux/bpf.h (+9 −1)

@@ -165,6 +165,7 @@ enum bpf_attach_type {
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
 
+/* flags for BPF_MAP_CREATE command */
 #define BPF_F_NO_PREALLOC	(1U << 0)
 /* Instead of having one common LRU list in the
  * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
@@ -173,6 +174,8 @@ enum bpf_attach_type {
  * across different LRU lists.
  */
 #define BPF_F_NO_COMMON_LRU	(1U << 1)
+/* Specify numa node during map creation */
+#define BPF_F_NUMA_NODE		(1U << 2)
 
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
@@ -180,8 +183,13 @@ union bpf_attr {
 		__u32	key_size;	/* size of key in bytes */
 		__u32	value_size;	/* size of value in bytes */
 		__u32	max_entries;	/* max number of entries in a map */
-		__u32	map_flags;	/* prealloc or not */
+		__u32	map_flags;	/* BPF_MAP_CREATE related
+					 * flags defined above.
+					 */
 		__u32	inner_map_fd;	/* fd pointing to the inner map */
+		__u32	numa_node;	/* numa node (effective only if
+					 * BPF_F_NUMA_NODE is set).
+					 */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
kernel/bpf/arraymap.c (+5 −2)

@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_array *array;
 	u64 array_size;
 	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags)
+	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size);
+	array = bpf_map_area_alloc(array_size, numa_node);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 
@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
 	array->map.map_flags = attr->map_flags;
+	array->map.numa_node = numa_node;
 	array->elem_size = elem_size;
 
 	if (!percpu)
kernel/bpf/devmap.c (+6 −3)

@@ -80,7 +80,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags)
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
@@ -93,6 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	dtab->map.value_size = attr->value_size;
 	dtab->map.max_entries = attr->max_entries;
 	dtab->map.map_flags = attr->map_flags;
+	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	err = -ENOMEM;
 
@@ -119,7 +120,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 		goto free_dtab;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
-					      sizeof(struct bpf_dtab_netdev *));
+					      sizeof(struct bpf_dtab_netdev *),
+					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
@@ -344,7 +346,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
+		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+				   map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
kernel/bpf/hashtab.c (+15 −4)

@@ -18,6 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
+#define HTAB_CREATE_FLAG_MASK \
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+
 struct bucket {
 	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
@@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
 		num_entries += num_possible_cpus();
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+					 htab->map.numa_node);
 	if (!htab->elems)
 		return -ENOMEM;
 
@@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_htab *htab;
 	int err, i;
 	u64 cost;
@@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
 		return ERR_PTR(-EINVAL);
 
@@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (lru && !prealloc)
 		return ERR_PTR(-ENOTSUPP);
 
+	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+		return ERR_PTR(-EINVAL);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
 	htab->map.map_flags = attr->map_flags;
+	htab->map.numa_node = numa_node;
 
 	/* check sanity of attributes.
 	 * value_size == 0 may be allowed in the future to use map as a set
@@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
-					   sizeof(struct bucket));
+					   sizeof(struct bucket),
+					   htab->map.numa_node);
 	if (!htab->buckets)
 		goto free_htab;
 
@@ -689,7 +699,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 				atomic_dec(&htab->count);
 				return ERR_PTR(-E2BIG);
 			}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+				     htab->map.numa_node);
 		if (!l_new)
 			return ERR_PTR(-ENOMEM);
 	}