Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d83525ca authored by Alexei Starovoitov's avatar Alexei Starovoitov Committed by Daniel Borkmann
Browse files

bpf: introduce bpf_spin_lock



Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
bpf program serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- bpf program can take one bpf_spin_lock at a time, since two or more can
  cause dead locks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  It drastically simplifies implementation yet allows bpf program to use
  any number of bpf_spin_locks.
- when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not allowed.
- bpf program must bpf_spin_unlock() before return.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use bpf_spin_lock() helper the BTF description of map value must be
  a struct and have 'struct bpf_spin_lock anyname;' field at the top level.
  Nested lock inside another struct is not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot be on the stack or inside networking packet.
  bpf_spin_lock can only be inside HASH or ARRAY map value.
- bpf_spin_lock is available to root only and to all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks

Implementation details:
- cgroup-bpf class of programs can nest with xdp/tc programs.
  Hence bpf_spin_lock is equivalent to spin_lock_irqsave.
  Other solutions to avoid nested bpf_spin_lock are possible.
  Like making sure that all networking progs run with softirq disabled.
  spin_lock_irqsave is the simplest and doesn't add overhead to the
  programs that don't use it.
- arch_spinlock_t is used when its implemented as queued_spin_lock
- archs can force their own arch_spinlock_t
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used.
- presence of bpf_spin_lock inside map value could have been indicated via
  extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for map key/value and reduces user mistakes.

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
  to request kernel to grab bpf_spin_lock before rewriting the value.
  That will serialize access to map elements.

Acked-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parent 1832f4ef
Loading
Loading
Loading
Loading
+34 −3
Original line number Diff line number Diff line
@@ -72,14 +72,15 @@ struct bpf_map {
	u32 value_size;
	u32 max_entries;
	u32 map_flags;
	u32 pages;
	int spin_lock_off; /* >=0 valid offset, <0 error */
	u32 id;
	int numa_node;
	u32 btf_key_type_id;
	u32 btf_value_type_id;
	struct btf *btf;
	u32 pages;
	bool unpriv_array;
	/* 55 bytes hole */
	/* 51 bytes hole */

	/* The 3rd and 4th cacheline with misc members to avoid false sharing
	 * particularly with refcounting.
@@ -91,6 +92,34 @@ struct bpf_map {
	char name[BPF_OBJ_NAME_LEN];
};

static inline bool map_value_has_spin_lock(const struct bpf_map *map)
{
	return map->spin_lock_off >= 0;
}

static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
{
	if (likely(!map_value_has_spin_lock(map)))
		return;
	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
		(struct bpf_spin_lock){};
}

/* copy everything but bpf_spin_lock */
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
	if (unlikely(map_value_has_spin_lock(map))) {
		u32 off = map->spin_lock_off;

		memcpy(dst, src, off);
		memcpy(dst + off + sizeof(struct bpf_spin_lock),
		       src + off + sizeof(struct bpf_spin_lock),
		       map->value_size - off - sizeof(struct bpf_spin_lock));
	} else {
		memcpy(dst, src, map->value_size);
	}
}

struct bpf_offload_dev;
struct bpf_offloaded_map;

@@ -162,6 +191,7 @@ enum bpf_arg_type {
	ARG_PTR_TO_CTX,		/* pointer to context */
	ARG_ANYTHING,		/* any (initialized) argument is ok */
	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
};

/* type of values returned from helper functions */
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;

extern const struct bpf_func_proto bpf_spin_lock_proto;
extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;

/* Shared helpers among cBPF and eBPF. */
+1 −0
Original line number Diff line number Diff line
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
	/* call stack tracking */
	struct bpf_func_state *frame[MAX_CALL_FRAMES];
	u32 curframe;
	u32 active_spin_lock;
	bool speculative;
};

+1 −0
Original line number Diff line number Diff line
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
			   const struct btf_member *m,
			   u32 expected_offset, u32 expected_size);
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);

#ifdef CONFIG_BPF_SYSCALL
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
+6 −1
Original line number Diff line number Diff line
@@ -2422,7 +2422,9 @@ union bpf_attr {
	FN(map_peek_elem),		\
	FN(msg_push_data),		\
	FN(msg_pop_data),		\
	FN(rc_pointer_rel),
	FN(rc_pointer_rel),		\
	FN(spin_lock),			\
	FN(spin_unlock),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -3056,4 +3058,7 @@ struct bpf_line_info {
	__u32	line_col;
};

struct bpf_spin_lock {
	__u32	val;
};
#endif /* _UAPI__LINUX_BPF_H__ */
+3 −0
Original line number Diff line number Diff line
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
	depends on SMP

config BPF_ARCH_SPINLOCK
	bool

config ARCH_USE_QUEUED_RWLOCKS
	bool

Loading