Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8387fbac authored by Sabrina Dubroca, committed by Greg Kroah-Hartman
Browse files

ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu



[ Upstream commit d52e5a7e7ca49457dd31fc8b42fb7c0d58a31221 ]

Prior to the rework of PMTU information storage in commit
2c8cec5c ("ipv4: Cache learned PMTU information in inetpeer."),
when a PMTU event advertising a PMTU smaller than
net.ipv4.route.min_pmtu was received, we would disable setting the DF
flag on packets by locking the MTU metric, and set the PMTU to
net.ipv4.route.min_pmtu.

Since then, we don't disable DF, and set PMTU to
net.ipv4.route.min_pmtu, so the intermediate router that has this link
with a small MTU will have to drop the packets.

This patch reestablishes pre-2.6.39 behavior by splitting
rtable->rt_pmtu into a bitfield with rt_mtu_locked and rt_pmtu.
rt_mtu_locked indicates that we shouldn't set the DF bit on that path,
and is checked in ip_dont_fragment().

One possible workaround is to set net.ipv4.route.min_pmtu to a value low
enough to accommodate the lowest MTU encountered.

Fixes: 2c8cec5c ("ipv4: Cache learned PMTU information in inetpeer.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 7c84e5e9
Loading
Loading
Loading
Loading
+9 −2
Original line number Diff line number Diff line
@@ -326,6 +326,13 @@ int ip_decrease_ttl(struct iphdr *iph)
	return --iph->ttl;
}

/*
 * ip_mtu_locked - tell whether the MTU of the path behind @dst is locked.
 *
 * Returns 1 when either the route's rt_mtu_locked bit is set or the
 * RTAX_MTU metric of @dst is locked, and 0 otherwise.
 *
 * NOTE(review): @dst is reinterpreted as a struct rtable, so callers are
 * presumably expected to pass only IPv4 route entries - confirm.
 */
static inline int ip_mtu_locked(const struct dst_entry *dst)
{
	const struct rtable *route = (const struct rtable *)dst;

	/* Per-route lock bit wins; skip the metric lookup when it is set. */
	if (route->rt_mtu_locked)
		return 1;

	/* Otherwise fall back to the lock state of the MTU metric. */
	return dst_metric_locked(dst, RTAX_MTU) ? 1 : 0;
}

static inline
int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
{
@@ -333,7 +340,7 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)

	return  pmtudisc == IP_PMTUDISC_DO ||
		(pmtudisc == IP_PMTUDISC_WANT &&
		 !(dst_metric_locked(dst, RTAX_MTU)));
		 !ip_mtu_locked(dst));
}

static inline bool ip_sk_accept_pmtu(const struct sock *sk)
@@ -359,7 +366,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
	struct net *net = dev_net(dst->dev);

	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
	    dst_metric_locked(dst, RTAX_MTU) ||
	    ip_mtu_locked(dst) ||
	    !forwarding)
		return dst_mtu(dst);

+1 −0
Original line number Diff line number Diff line
@@ -59,6 +59,7 @@ struct fib_nh_exception {
	int				fnhe_genid;
	__be32				fnhe_daddr;
	u32				fnhe_pmtu;
	bool				fnhe_mtu_locked;
	__be32				fnhe_gw;
	unsigned long			fnhe_expires;
	struct rtable __rcu		*fnhe_rth_input;
+2 −1
Original line number Diff line number Diff line
@@ -63,7 +63,8 @@ struct rtable {
	__be32			rt_gateway;

	/* Miscellaneous cached information */
	u32			rt_pmtu;
	u32			rt_mtu_locked:1,
				rt_pmtu:31;

	u32			rt_table_id;

+19 −7
Original line number Diff line number Diff line
@@ -639,6 +639,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
@@ -649,7 +650,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
@@ -686,8 +687,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu)
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
@@ -711,6 +714,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
@@ -792,7 +796,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, jiffies + ip_rt_gc_timeout);
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1005,15 +1010,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;
	bool lock = false;

	if (dst_metric_locked(dst, RTAX_MTU))
	if (ip_mtu_locked(dst))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = ip_rt_min_pmtu;
	}

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
@@ -1023,7 +1031,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
@@ -1276,7 +1284,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}
@@ -1548,6 +1556,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_table_id = 0;
@@ -2526,6 +2535,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
@@ -2628,6 +2638,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

+1 −0
Original line number Diff line number Diff line
@@ -100,6 +100,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
	xdst->u.rt.rt_gateway = rt->rt_gateway;
	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
	xdst->u.rt.rt_table_id = rt->rt_table_id;
	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);