Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 25694738 authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'bpf_tcp_check_syncookie'

Lorenz Bauer says:

====================
This series adds the necessary helpers to determine whether a given
(encapsulated) TCP packet belongs to a connection known to the network stack.

* bpf_skc_lookup_tcp gives access to request and timewait sockets
* bpf_tcp_check_syncookie identifies the final 3WHS ACK when syncookies
  are enabled

The goal is to be able to implement load-balancing approaches like
glb-director [1] or Beamer [2] in pure eBPF. Specifically, we'd like to replace
the functionality of the glb-redirect kernel module [3] by an XDP program or
tc classifier.

Changes in v3:
* Fix missing check for ip4->ihl
* Only cast to unsigned long in BPF_CALLs

Changes in v2:
* Rename bpf_sk_check_syncookie to bpf_tcp_check_syncookie.
* Add bpf_skc_lookup_tcp. Without it bpf_tcp_check_syncookie doesn't make sense.
* Check tcp_synq_no_recent_overflow() in bpf_tcp_check_syncookie.
* Check th->syn in bpf_tcp_check_syncookie.
* Require CONFIG_IPV6 to be built-in.

1: https://github.com/github/glb-director
2: https://www.usenix.org/conference/nsdi18/presentation/olteanu
3: https://github.com/github/glb-director/tree/master/src/glb-redirect


====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 48e5d98a bafc0ba8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -205,6 +205,7 @@ enum bpf_return_type {
	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
	RET_PTR_TO_SOCK_COMMON_OR_NULL,	/* returns a pointer to a sock_common or NULL */
};

/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
+35 −1
Original line number Diff line number Diff line
@@ -2431,6 +2431,38 @@ union bpf_attr {
 *	Return
 *		A **struct bpf_sock** pointer on success, or **NULL** in
 *		case of failure.
 *
 * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
 *	Description
 *		Look for TCP socket matching *tuple*, optionally in a child
 *		network namespace *netns*. The return value must be checked,
 *		and if non-**NULL**, released via **bpf_sk_release**\ ().
 *
 *		This function is identical to bpf_sk_lookup_tcp, except that it
 *		also returns timewait or request sockets. Use bpf_sk_fullsock
 *		or bpf_tcp_sock to access the full structure.
 *
 *		This helper is available only if the kernel was compiled with
 *		**CONFIG_NET** configuration option.
 *	Return
 *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
 *		For sockets with reuseport option, the **struct bpf_sock**
 *		result is from **reuse->socks**\ [] using the hash of the tuple.
 *
 * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
 * 	Description
 * 		Check whether iph and th contain a valid SYN cookie ACK for
 * 		the listening socket in sk.
 *
 * 		iph points to the start of the IPv4 or IPv6 header, while
 * 		iph_len contains sizeof(struct iphdr) or sizeof(struct ipv6hdr).
 *
 * 		th points to the start of the TCP header, while th_len contains
 * 		sizeof(struct tcphdr).
 *
 * 	Return
 * 		0 if iph and th are a valid SYN cookie ACK, or a negative error
 * 		otherwise.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -2531,7 +2563,9 @@ union bpf_attr {
	FN(sk_fullsock),		\
	FN(tcp_sock),			\
	FN(skb_ecn_set_ce),		\
	FN(get_listener_sock),
	FN(get_listener_sock),		\
	FN(skc_lookup_tcp),		\
	FN(tcp_check_syncookie),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
+18 −15
Original line number Diff line number Diff line
@@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id)
static bool is_acquire_function(enum bpf_func_id func_id)
{
	return func_id == BPF_FUNC_sk_lookup_tcp ||
		func_id == BPF_FUNC_sk_lookup_udp;
		func_id == BPF_FUNC_sk_lookup_udp ||
		func_id == BPF_FUNC_skc_lookup_tcp;
}

static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -3147,19 +3148,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
		mark_reg_known_zero(env, regs, BPF_REG_0);
		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
		if (is_acquire_function(func_id)) {
			int id = acquire_reference_state(env, insn_idx);

			if (id < 0)
				return id;
			/* For mark_ptr_or_null_reg() */
			regs[BPF_REG_0].id = id;
			/* For release_reference() */
			regs[BPF_REG_0].ref_obj_id = id;
		} else {
			/* For mark_ptr_or_null_reg() */
		regs[BPF_REG_0].id = ++env->id_gen;
		}
	} else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
		mark_reg_known_zero(env, regs, BPF_REG_0);
		regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
		regs[BPF_REG_0].id = ++env->id_gen;
	} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
		mark_reg_known_zero(env, regs, BPF_REG_0);
		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
@@ -3170,9 +3163,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
		return -EINVAL;
	}

	if (is_ptr_cast_function(func_id))
	if (is_ptr_cast_function(func_id)) {
		/* For release_reference() */
		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
	} else if (is_acquire_function(func_id)) {
		int id = acquire_reference_state(env, insn_idx);

		if (id < 0)
			return id;
		/* For mark_ptr_or_null_reg() */
		regs[BPF_REG_0].id = id;
		/* For release_reference() */
		regs[BPF_REG_0].ref_obj_id = id;
	}

	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);

+194 −22
Original line number Diff line number Diff line
@@ -5156,13 +5156,13 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
	return sk;
}

/* bpf_sk_lookup performs the core lookup for different types of sockets,
/* bpf_skc_lookup performs the core lookup for different types of sockets,
 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
 * Returns the socket as an 'unsigned long' to simplify the casting in the
 * callers to satisfy BPF_CALL declarations.
 */
static unsigned long
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
		 u64 flags)
{
@@ -5192,14 +5192,26 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		put_net(net);
	}

out:
	return sk;
}

static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
		u64 flags)
{
	struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
					   ifindex, proto, netns_id, flags);

	if (sk)
		sk = sk_to_full_sk(sk);
out:
	return (unsigned long) sk;

	return sk;
}

static unsigned long
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
	       u8 proto, u64 netns_id, u64 flags)
{
	struct net *caller_net;
@@ -5213,14 +5225,47 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		ifindex = 0;
	}

	return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
			      proto, netns_id, flags);
	return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
				netns_id, flags);
}

/* Like bpf_skc_lookup(), but promote any request/timewait socket that
 * the lookup returned to its full (listener) socket before handing it
 * back to the caller.
 */
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
	      u8 proto, u64 netns_id, u64 flags)
{
	struct sock *sk;

	sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags);
	if (!sk)
		return NULL;

	return sk_to_full_sk(sk);
}

/* BPF helper: look up a TCP socket by tuple from an skb context.
 * Unlike bpf_sk_lookup_tcp this may also return request or timewait
 * sockets (sock_common). Cast to unsigned long per BPF_CALL convention.
 */
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	struct sock *sk = bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
					 netns_id, flags);

	return (unsigned long)sk;
}

/* Verifier prototype for the skb-context bpf_skc_lookup_tcp helper.
 * Returns a sock_common pointer (possibly request/timewait) or NULL;
 * the acquired reference must be released via bpf_sk_release().
 */
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
	.func		= bpf_skc_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
					    netns_id, flags);
}

static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
@@ -5238,7 +5283,8 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
					    netns_id, flags);
}

static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
@@ -5273,8 +5319,9 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
	struct net *caller_net = dev_net(ctx->rxq->dev);
	int ifindex = ctx->rxq->dev->ifindex;

	return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
			      IPPROTO_UDP, netns_id, flags);
	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
					      ifindex, IPPROTO_UDP, netns_id,
					      flags);
}

static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
@@ -5289,14 +5336,38 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
	.arg5_type      = ARG_ANYTHING,
};

/* XDP variant of bpf_skc_lookup_tcp: there is no skb, so the calling
 * netns and ifindex are derived from the receive queue's net_device.
 */
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
	struct net_device *dev = ctx->rxq->dev;
	struct sock *sk;

	sk = __bpf_skc_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
			      IPPROTO_TCP, netns_id, flags);

	return (unsigned long)sk;
}

/* Verifier prototype for the XDP-context bpf_skc_lookup_tcp helper.
 * Same contract as the skb variant: may return request/timewait
 * sockets; a non-NULL result must be released via bpf_sk_release().
 */
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
	.func           = bpf_xdp_skc_lookup_tcp,
	.gpl_only       = false,
	.pkt_access     = true,
	.ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_PTR_TO_MEM,
	.arg3_type      = ARG_CONST_SIZE,
	.arg4_type      = ARG_ANYTHING,
	.arg5_type      = ARG_ANYTHING,
};

BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
	struct net *caller_net = dev_net(ctx->rxq->dev);
	int ifindex = ctx->rxq->dev->ifindex;

	return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
			      IPPROTO_TCP, netns_id, flags);
	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
					      ifindex, IPPROTO_TCP, netns_id,
					      flags);
}

static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
@@ -5311,13 +5382,33 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
	.arg5_type      = ARG_ANYTHING,
};

BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
					       sock_net(ctx->sk), 0,
					       IPPROTO_TCP, netns_id, flags);
}

/* Verifier prototype for the sock_addr-context bpf_skc_lookup_tcp
 * helper. No pkt_access: sock_addr programs have no packet data.
 */
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
	.func		= bpf_sock_addr_skc_lookup_tcp,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* sock_addr-program variant of bpf_sk_lookup_tcp: the lookup runs in
 * the context socket's netns, with no ifindex restriction (0).
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	struct net *net = sock_net(ctx->sk);

	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, net, 0,
					      IPPROTO_TCP, netns_id, flags);
}

static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
	.func		= bpf_sock_addr_sk_lookup_tcp,
	.gpl_only	= false,
@@ -5332,8 +5423,9 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
			       IPPROTO_UDP, netns_id, flags);
	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
					      sock_net(ctx->sk), 0, IPPROTO_UDP,
					      netns_id, flags);
}

static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
@@ -5461,6 +5553,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: check whether iph/th carry a valid SYN cookie ACK (the
 * final ACK of the 3WHS when syncookies are in effect) for the
 * listening socket sk.
 *
 * Returns 0 if the cookie is valid, -EINVAL on malformed arguments or
 * when syncookies are disabled, -ENOENT when the packet is not a valid
 * cookie ACK, -EPROTONOSUPPORT for unsupported address families, and
 * -ENOTSUPP when the kernel lacks CONFIG_SYN_COOKIES.
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
	   struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
	u32 cookie;
	int ret;

	/* Caller must hand us at least a full TCP header. */
	if (unlikely(th_len < sizeof(*th)))
		return -EINVAL;

	/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
		return -EINVAL;

	/* Cookies are only meaningful when the netns has them enabled. */
	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
		return -EINVAL;

	/* Only a pure ACK (no SYN, no RST) can carry a SYN cookie. */
	if (!th->ack || th->rst || th->syn)
		return -ENOENT;

	/* Without a recent SYN-queue overflow, no cookies were issued. */
	if (tcp_synq_no_recent_overflow(sk))
		return -ENOENT;

	/* The cookie is the SYN-ACK sequence number, i.e. ack_seq - 1. */
	cookie = ntohl(th->ack_seq) - 1;

	switch (sk->sk_family) {
	case AF_INET:
		if (unlikely(iph_len < sizeof(struct iphdr)))
			return -EINVAL;

		ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
		break;

#if IS_BUILTIN(CONFIG_IPV6)
	case AF_INET6:
		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
			return -EINVAL;

		ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
		break;
#endif /* CONFIG_IPV6 */

	default:
		return -EPROTONOSUPPORT;
	}

	/* __cookie_v{4,6}_check() return the decoded MSS (> 0) on success. */
	if (ret > 0)
		return 0;

	return -ENOENT;
#else
	return -ENOTSUPP;
#endif
}

/* Verifier prototype for bpf_tcp_check_syncookie. GPL-only because it
 * reaches into core TCP syncookie internals. arg1 accepts sock_common
 * so request/timewait results of bpf_skc_lookup_tcp can be passed in.
 */
static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
	.func		= bpf_tcp_check_syncookie,
	.gpl_only	= true,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_PTR_TO_MEM,
	.arg5_type	= ARG_CONST_SIZE,
};

#endif /* CONFIG_INET */

bool bpf_helper_changes_pkt_data(void *func)
@@ -5586,6 +5746,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sock_addr_sk_lookup_udp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
	case BPF_FUNC_skc_lookup_tcp:
		return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
	default:
		return bpf_base_func_proto(func_id);
@@ -5719,6 +5881,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_tcp_sock_proto;
	case BPF_FUNC_get_listener_sock:
		return &bpf_get_listener_sock_proto;
	case BPF_FUNC_skc_lookup_tcp:
		return &bpf_skc_lookup_tcp_proto;
	case BPF_FUNC_tcp_check_syncookie:
		return &bpf_tcp_check_syncookie_proto;
#endif
	default:
		return bpf_base_func_proto(func_id);
@@ -5754,6 +5920,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_xdp_sk_lookup_tcp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
	case BPF_FUNC_skc_lookup_tcp:
		return &bpf_xdp_skc_lookup_tcp_proto;
	case BPF_FUNC_tcp_check_syncookie:
		return &bpf_tcp_check_syncookie_proto;
#endif
	default:
		return bpf_base_func_proto(func_id);
@@ -5846,6 +6016,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sk_lookup_udp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
	case BPF_FUNC_skc_lookup_tcp:
		return &bpf_skc_lookup_tcp_proto;
#endif
	default:
		return bpf_base_func_proto(func_id);
+35 −1
Original line number Diff line number Diff line
@@ -2431,6 +2431,38 @@ union bpf_attr {
 *	Return
 *		A **struct bpf_sock** pointer on success, or **NULL** in
 *		case of failure.
 *
 * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
 *	Description
 *		Look for TCP socket matching *tuple*, optionally in a child
 *		network namespace *netns*. The return value must be checked,
 *		and if non-**NULL**, released via **bpf_sk_release**\ ().
 *
 *		This function is identical to bpf_sk_lookup_tcp, except that it
 *		also returns timewait or request sockets. Use bpf_sk_fullsock
 *		or bpf_tcp_sock to access the full structure.
 *
 *		This helper is available only if the kernel was compiled with
 *		**CONFIG_NET** configuration option.
 *	Return
 *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
 *		For sockets with reuseport option, the **struct bpf_sock**
 *		result is from **reuse->socks**\ [] using the hash of the tuple.
 *
 * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
 * 	Description
 * 		Check whether iph and th contain a valid SYN cookie ACK for
 * 		the listening socket in sk.
 *
 * 		iph points to the start of the IPv4 or IPv6 header, while
 * 		iph_len contains sizeof(struct iphdr) or sizeof(struct ipv6hdr).
 *
 * 		th points to the start of the TCP header, while th_len contains
 * 		sizeof(struct tcphdr).
 *
 * 	Return
 * 		0 if iph and th are a valid SYN cookie ACK, or a negative error
 * 		otherwise.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -2531,7 +2563,9 @@ union bpf_attr {
	FN(sk_fullsock),		\
	FN(tcp_sock),			\
	FN(skb_ecn_set_ce),		\
	FN(get_listener_sock),
	FN(get_listener_sock),		\
	FN(skc_lookup_tcp),		\
	FN(tcp_check_syncookie),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
Loading