Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 837b9955 authored by David S. Miller
Browse files

Merge branch 'ip_frag_next'



Florian Westphal says:

====================
net: force refragmentation for DF reassembled skbs

output path tests:

    if (skb->len > mtu) ip_fragment()

This breaks connectivity in one corner case:
 If the skb was reassembled, but has the DF bit set and ..
 .. its reassembled size is <= outdev mtu ..
 .. we will forward a DF packet larger than what the sender
    transmitted on wire.

If a router later in the path can't forward this packet, it will send an
icmp error in response to an mtu that the original sender never exceeded.

This changes ipv4 defrag/output path to

a) force refragmentation for DF reassembled skbs and
b) set DF bit on all fragments when refragmenting if it was set on original
frags.

tested via:
from scapy.all import *
dip="10.23.42.2"
payload="A"*1400
packet=IP(dst=dip,id=12345,flags='DF')/UDP(sport=42,dport=42)/payload
frags=fragment(packet,fragsize=1200)
for fragment in frags:
    send(fragment)

Without this patch, when fragmenting after forwarding we generate fragments
without the DF bit set, sized according to the outgoing device mtu, i.e.

IP (ttl 64, id 12345, offset 0, flags [+, DF], proto UDP (17), length 1204)
    192.168.7.1.42 > 10.23.42.2.42: UDP, length 1400
IP (ttl 64, id 12345, offset 1184, flags [DF], proto UDP (17), length 244)
    192.168.7.1 > 10.23.42.2: ip-proto-17

on ingress will either turn into

IP (ttl 63, id 12345, offset 0, flags [+], proto UDP (17), length 1396)
    192.168.7.1.42 > 10.23.42.2.42: UDP, length 1400
IP (ttl 63, id 12345, offset 1376, flags [none], proto UDP (17), length 52)

(mtu 1400: We strip df and send larger fragment), or

IP (ttl 63, id 12345, offset 0, flags [DF], proto UDP (17), length 1428)
    192.168.7.1.42 > 10.23.42.2.42: [udp sum ok] UDP, length 1400

if mtu is 1500.  And in this case things break; a router with a smaller mtu
will send an icmp error, but the original sender only sent packets <= 1204 bytes.

With the patch, we keep the intent of such fragments and will emit DF-fragments
that won't exceed 1204 bytes in size.

Joint work with Hannes Frederic Sowa.

Changes since v2:
 - split unrelated patches from series
 - rework changelog of patch #2 to better illustrate breakage
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 8c0ce770 d6b915e2
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -43,7 +43,7 @@ enum {
 * @len: total length of the original datagram
 * @meat: length of received fragments so far
 * @flags: fragment queue flags
 * @max_size: (ipv4 only) maximum received fragment size with IP_DF set
 * @max_size: maximum received fragment size
 * @net: namespace that this frag belongs to
 */
struct inet_frag_queue {
+1 −0
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ struct inet_skb_parm {
#define IPSKB_FRAG_COMPLETE	BIT(3)
#define IPSKB_REROUTED		BIT(4)
#define IPSKB_DOREDIRECT	BIT(5)
#define IPSKB_FRAG_PMTU		BIT(6)

	u16			frag_max_size;
};
+26 −5
Original line number Diff line number Diff line
@@ -75,6 +75,7 @@ struct ipq {
	__be16		id;
	u8		protocol;
	u8		ecn; /* RFC3168 support */
	u16		max_df_size; /* largest frag with DF set seen */
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
@@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	unsigned int fragsize;
	int flags, offset;
	int ihl, end;
	int err = -ENOENT;
@@ -481,9 +483,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
	if (offset == 0)
		qp->q.flags |= INET_FRAG_FIRST_IN;

	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    skb->len + ihl > qp->q.max_size)
		qp->q.max_size = skb->len + ihl;
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;

	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
@@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
	head->next = NULL;
	head->dev = dev;
	head->tstamp = qp->q.stamp;
	IPCB(head)->frag_max_size = qp->q.max_size;
	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);

	iph = ip_hdr(head);
	/* max_size != 0 implies at least one fragment had IP_DF set */
	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
	iph->tot_len = htons(len);
	iph->tos |= ecn;

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
	qp->q.fragments = NULL;
	qp->q.fragments_tail = NULL;
+21 −8
Original line number Diff line number Diff line
@@ -84,6 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

static int ip_fragment(struct sock *sk, struct sk_buff *skb,
		       unsigned int mtu,
		       int (*output)(struct sock *, struct sk_buff *));

/* Generate a checksum for an outgoing IP datagram. */
@@ -219,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
	return -EINVAL;
}

static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
				unsigned int mtu)
{
	netdev_features_t features;
	struct sk_buff *segs;
@@ -227,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)

	/* common case: locally created skb or seglen is <= mtu */
	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
	      skb_gso_network_seglen(skb) <= mtu)
		return ip_finish_output2(sk, skb);

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
@@ -251,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
		int err;

		segs->next = NULL;
		err = ip_fragment(sk, segs, ip_finish_output2);
		err = ip_fragment(sk, segs, mtu, ip_finish_output2);

		if (err && ret == 0)
			ret = err;
@@ -263,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)

static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
@@ -270,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
		return dst_output_sk(sk, skb);
	}
#endif
	mtu = ip_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip_finish_output_gso(sk, skb);
		return ip_finish_output_gso(sk, skb, mtu);

	if (skb->len > ip_skb_dst_mtu(skb))
		return ip_fragment(sk, skb, ip_finish_output2);
	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
		return ip_fragment(sk, skb, mtu, ip_finish_output2);

	return ip_finish_output2(sk, skb);
}
@@ -482,12 +487,15 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
}

static int ip_fragment(struct sock *sk, struct sk_buff *skb,
		       unsigned int mtu,
		       int (*output)(struct sock *, struct sk_buff *))
{
	struct iphdr *iph = ip_hdr(skb);
	unsigned int mtu = ip_skb_dst_mtu(skb);

	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
	if ((iph->frag_off & htons(IP_DF)) == 0)
		return ip_do_fragment(sk, skb, output);

	if (unlikely(!skb->ignore_df ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
		struct rtable *rt = skb_rtable(skb);
@@ -532,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
	iph = ip_hdr(skb);

	mtu = ip_skb_dst_mtu(skb);
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;

	/*
	 *	Setup starting values.
@@ -727,6 +737,9 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
			iph->frag_off |= htons(IP_DF);

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE