Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 629a0025 authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'bpf-tc-tunneling'



Willem de Bruijn says:

====================
BPF allows for dynamic tunneling, choosing the tunnel destination and
features on-demand. Extend bpf_skb_adjust_room to allow for efficient
tunneling at the TC hooks.

Most features are required for large packets with GSO, as these will
be modified after this patch.

Patch 1
  is a performance optimization, avoiding an unnecessary unclone
  for the TCP hot path.

Patches 2..6
  introduce a regression test. These can be squashed, but the code is
  arguably more readable when gradually expanding the feature set.

Patch 7
  is a performance optimization, avoid copying network headers
  that are going to be overwritten. This also simplifies the bpf
  program.

Patch 8
  reenables bpf_skb_adjust_room for UDP packets.

Patch 9
  configures skb tunneling metadata analogous to tunnel devices.

Patches 10..13
  expand the regression test to make use of the new features and
  enable the GSO testcases.

Changes
  v1->v2
  - move BPF_F_ADJ_ROOM_MASK out of uapi as it can be expanded
  - document new flags
  - in tests replace netcat -q flag with coreutils timeout:
      the -q flag is not supported in all netcat versions
  v2->v3
  - move BPF_F_ADJ_ROOM_ENCAP_L3_MASK out of uapi as it has no
    use in userspace
====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents f6827526 75a1a9fa
Loading
Loading
Loading
Loading
+26 −3
Original line number Diff line number Diff line
@@ -1478,13 +1478,27 @@ union bpf_attr {
 * 		Grow or shrink the room for data in the packet associated to
 * 		*skb* by *len_diff*, and according to the selected *mode*.
 *
 * 		There is a single supported mode at this time:
 *		There are two supported modes at this time:
 *
 *		* **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
 *		  (room space is added or removed below the layer 2 header).
 *
 * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
 * 		  (room space is added or removed below the layer 3 header).
 *
 * 		All values for *flags* are reserved for future usage, and must
 * 		be left at zero.
 *		The following flags are supported at this time:
 *
 *		* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
 *		  Adjusting mss in this way is not allowed for datagrams.
 *
 *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **:
 *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **:
 *		  Any new space is reserved to hold a tunnel header.
 *		  Configure skb offsets and other fields accordingly.
 *
 *		* **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **:
 *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
 *		  Use with ENCAP_L3 flags to further specify the tunnel type.
 *
 * 		A call to this helper is susceptible to change the underlaying
 * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2624,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */
#define BPF_F_CURRENT_NETNS		(-1L)

/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)

#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)

/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
	BPF_ADJ_ROOM_NET,
	BPF_ADJ_ROOM_MAC,
};

/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+103 −29
Original line number Diff line number Diff line
@@ -2963,42 +2963,113 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
	}
}

static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK	(BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
					 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

#define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP)

static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
			    u64 flags)
{
	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
	unsigned int gso_type = SKB_GSO_DODGY;
	u16 mac_len, inner_net, inner_trans;
	int ret;

	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
		/* udp gso_size delineates datagrams, only allow if fixed */
		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
		    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			return -ENOTSUPP;
	}

	ret = skb_cow(skb, len_diff);
	ret = skb_cow_head(skb, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (encap) {
		if (skb->protocol != htons(ETH_P_IP) &&
		    skb->protocol != htons(ETH_P_IPV6))
			return -ENOTSUPP;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
			return -EINVAL;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
			return -EINVAL;

		if (skb->encapsulation)
			return -EALREADY;

		mac_len = skb->network_header - skb->mac_header;
		inner_net = skb->network_header;
		inner_trans = skb->transport_header;
	}

	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (encap) {
		/* inner mac == inner_net on l3 encap */
		skb->inner_mac_header = inner_net;
		skb->inner_network_header = inner_net;
		skb->inner_transport_header = inner_trans;
		skb_set_inner_protocol(skb, skb->protocol);

		skb->encapsulation = 1;
		skb_set_network_header(skb, mac_len);

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
			gso_type |= SKB_GSO_UDP_TUNNEL;
		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
			gso_type |= SKB_GSO_GRE;
		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
			gso_type |= SKB_GSO_IPXIP6;
		else
			gso_type |= SKB_GSO_IPXIP4;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
			int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
					sizeof(struct ipv6hdr) :
					sizeof(struct iphdr);

			skb_set_transport_header(skb, mac_len + nh_len);
		}
	}

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* Due to header grow, MSS needs to be downgraded. */
		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			skb_decrease_gso_size(shinfo, len_diff);

		/* Header must be checked, and gso_segs recomputed. */
		shinfo->gso_type |= SKB_GSO_DODGY;
		shinfo->gso_type |= gso_type;
		shinfo->gso_segs = 0;
	}

	return 0;
}

static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
			      u64 flags)
{
	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
	int ret;

	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
		/* udp gso_size delineates datagrams, only allow if fixed */
		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
		    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			return -ENOTSUPP;
	}

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
@@ -3012,7 +3083,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* Due to header shrink, MSS can be upgraded. */
		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			skb_increase_gso_size(shinfo, len_diff);

		/* Header must be checked, and gso_segs recomputed. */
		shinfo->gso_type |= SKB_GSO_DODGY;
		shinfo->gso_segs = 0;
@@ -3027,49 +3100,50 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
			  SKB_MAX_ALLOC;
}

static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
	   u32, mode, u64, flags)
{
	bool trans_same = skb->transport_header == skb->network_header;
	u32 len_cur, len_diff_abs = abs(len_diff);
	u32 len_min = bpf_skb_net_base_len(skb);
	u32 len_max = __bpf_skb_max_len(skb);
	__be16 proto = skb->protocol;
	bool shrink = len_diff < 0;
	u32 off;
	int ret;

	if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
		return -EINVAL;
	if (unlikely(len_diff_abs > 0xfffU))
		return -EFAULT;
	if (unlikely(proto != htons(ETH_P_IP) &&
		     proto != htons(ETH_P_IPV6)))
		return -ENOTSUPP;

	off = skb_mac_header_len(skb);
	switch (mode) {
	case BPF_ADJ_ROOM_NET:
		off += bpf_skb_net_base_len(skb);
		break;
	case BPF_ADJ_ROOM_MAC:
		break;
	default:
		return -ENOTSUPP;
	}

	len_cur = skb->len - skb_network_offset(skb);
	if (skb_transport_header_was_set(skb) && !trans_same)
		len_cur = skb_network_header_len(skb);
	if ((shrink && (len_diff_abs >= len_cur ||
			len_cur - len_diff_abs < len_min)) ||
	    (!shrink && (skb->len + len_diff_abs > len_max &&
			 !skb_is_gso(skb))))
		return -ENOTSUPP;

	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
		       bpf_skb_net_grow(skb, len_diff_abs);
	ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
		       bpf_skb_net_grow(skb, off, len_diff_abs, flags);

	bpf_compute_data_pointers(skb);
	return ret;
}

BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
	   u32, mode, u64, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	if (likely(mode == BPF_ADJ_ROOM_NET))
		return bpf_skb_adjust_net(skb, len_diff);

	return -ENOTSUPP;
}

static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
	.func		= bpf_skb_adjust_room,
	.gpl_only	= false,
+26 −3
Original line number Diff line number Diff line
@@ -1478,13 +1478,27 @@ union bpf_attr {
 * 		Grow or shrink the room for data in the packet associated to
 * 		*skb* by *len_diff*, and according to the selected *mode*.
 *
 * 		There is a single supported mode at this time:
 *		There are two supported modes at this time:
 *
 *		* **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
 *		  (room space is added or removed below the layer 2 header).
 *
 * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
 * 		  (room space is added or removed below the layer 3 header).
 *
 * 		All values for *flags* are reserved for future usage, and must
 * 		be left at zero.
 *		The following flags are supported at this time:
 *
 *		* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
 *		  Adjusting mss in this way is not allowed for datagrams.
 *
 *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **:
 *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **:
 *		  Any new space is reserved to hold a tunnel header.
 *		  Configure skb offsets and other fields accordingly.
 *
 *		* **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **:
 *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
 *		  Use with ENCAP_L3 flags to further specify the tunnel type.
 *
 * 		A call to this helper is susceptible to change the underlaying
 * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2624,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */
#define BPF_F_CURRENT_NETNS		(-1L)

/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)

#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)

/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
	BPF_ADJ_ROOM_NET,
	BPF_ADJ_ROOM_MAC,
};

/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+2 −1
Original line number Diff line number Diff line
@@ -52,7 +52,8 @@ TEST_PROGS := test_kmod.sh \
	test_flow_dissector.sh \
	test_xdp_vlan.sh \
	test_lwt_ip_encap.sh \
	test_tcp_check_syncookie.sh
	test_tcp_check_syncookie.sh \
	test_tc_tunnel.sh

TEST_PROGS_EXTENDED := with_addr.sh \
	with_tunnels.sh \
+2 −0
Original line number Diff line number Diff line
@@ -23,3 +23,5 @@ CONFIG_LWTUNNEL=y
CONFIG_BPF_STREAM_PARSER=y
CONFIG_XDP_SOCKETS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
Loading