Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9f2f27a9 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'icmp-reply-optimize'



Jesper Dangaard Brouer says:

====================
net: optimize ICMP-reply code path

This patchset is optimizing the ICMP-reply code path, for ICMP packets
that gets rate limited. A remote party can easily trigger this code
path by sending packets to port number with no listening service.

Generally the patchset moves the sysctl_icmp_msgs_per_sec ratelimit
checking to earlier in the code path and removes an allocation.

Use-case: The specific case I experienced this being a bottleneck is,
sending UDP packets to a port with no listener, which obviously result
in kernel replying with ICMP Destination Unreachable (type:3), Port
Unreachable (code:3), which cause the bottleneck.

 After Eric and Paolo optimized the UDP socket code, the kernels PPS
processing capabilities is lower for no-listen ports, than normal UDP
sockets.  This is bad for capacity planning when restarting a service.

UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
 Baseline: 6.6 Mpps
 Patch:   14.7 Mpps
Driver mlx5 at 50Gbit/s.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents aaa9c107 7ba91ecb
Loading
Loading
Loading
Loading
+76 −49
Original line number Diff line number Diff line
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
	return *this_cpu_ptr(net->ipv4.icmp_sk);
}

/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
	struct sock *sk;

	local_bh_disable();

	sk = icmp_sk(net);

	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
		/* This can happen if the output path signals a
		 * dst_link_failure() for an outgoing ICMP packet.
		 */
		local_bh_enable();
		return NULL;
	}
	return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)

static inline void icmp_xmit_unlock(struct sock *sk)
{
	spin_unlock_bh(&sk->sk_lock.slock);
	spin_unlock(&sk->sk_lock.slock);
}

int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
}
EXPORT_SYMBOL(icmp_global_allow);

static bool icmpv4_mask_allow(struct net *net, int type, int code)
{
	if (type > NR_ICMP_TYPES)
		return true;

	/* Don't limit PMTU discovery. */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
		return true;

	/* Limit if icmp type is enabled in ratemask. */
	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
		return true;

	return false;
}

static bool icmpv4_global_allow(struct net *net, int type, int code)
{
	if (icmpv4_mask_allow(net, type, code))
		return true;

	if (icmp_global_allow())
		return true;

	return false;
}

/*
 *	Send an ICMP frame.
 */
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
			       struct flowi4 *fl4, int type, int code)
{
	struct dst_entry *dst = &rt->dst;
	struct inet_peer *peer;
	bool rc = true;
	int vif;

	if (type > NR_ICMP_TYPES)
		goto out;

	/* Don't limit PMTU discovery. */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	if (icmpv4_mask_allow(net, type, code))
		goto out;

	/* No rate limit on loopback */
	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
		goto out;

	/* Limit if icmp type is enabled in ratemask. */
	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
		goto out;

	rc = false;
	if (icmp_global_allow()) {
		int vif = l3mdev_master_ifindex(dst->dev);
		struct inet_peer *peer;

	vif = l3mdev_master_ifindex(dst->dev);
	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
		rc = inet_peer_xrlim_allow(peer,
					   net->ipv4.sysctl_icmp_ratelimit);
	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
	if (peer)
		inet_putpeer(peer);
	}
out:
	return rc;
}
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
	struct inet_sock *inet;
	__be32 daddr, saddr;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);
	int type = icmp_param->data.icmph.type;
	int code = icmp_param->data.icmph.code;

	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
		return;

	/* Needed by both icmp_global_allow and icmp_xmit_lock */
	local_bh_disable();

	/* global icmp_msgs_per_sec */
	if (!icmpv4_global_allow(net, type, code))
		goto out_bh_enable;

	sk = icmp_xmit_lock(net);
	if (!sk)
		return;
		goto out_bh_enable;
	inet = inet_sk(sk);

	icmp_param->data.icmph.checksum = 0;
@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		goto out_unlock;
	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
			       icmp_param->data.icmph.code))
	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
	ip_rt_put(rt);
out_unlock:
	icmp_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
	struct iphdr *iph;
	int room;
	struct icmp_bxm *icmp_param;
	struct icmp_bxm icmp_param;
	struct rtable *rt = skb_rtable(skb_in);
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
		}
	}

	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
	if (!icmp_param)
		return;
	/* Needed by both icmp_global_allow and icmp_xmit_lock */
	local_bh_disable();

	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
	if (!icmpv4_global_allow(net, type, code))
		goto out_bh_enable;

	sk = icmp_xmit_lock(net);
	if (!sk)
		goto out_free;
		goto out_bh_enable;

	/*
	 *	Construct source address and options.
@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
					  iph->tos;
	mark = IP4_REPLY_MARK(net, skb_in->mark);

	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
		goto out_unlock;


@@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
	 *	Prepare data for ICMP header.
	 */

	icmp_param->data.icmph.type	 = type;
	icmp_param->data.icmph.code	 = code;
	icmp_param->data.icmph.un.gateway = info;
	icmp_param->data.icmph.checksum	 = 0;
	icmp_param->skb	  = skb_in;
	icmp_param->offset = skb_network_offset(skb_in);
	icmp_param.data.icmph.type	 = type;
	icmp_param.data.icmph.code	 = code;
	icmp_param.data.icmph.un.gateway = info;
	icmp_param.data.icmph.checksum	 = 0;
	icmp_param.skb	  = skb_in;
	icmp_param.offset = skb_network_offset(skb_in);
	inet_sk(sk)->tos = tos;
	sk->sk_mark = mark;
	ipc.addr = iph->saddr;
	ipc.opt = &icmp_param->replyopts.opt;
	ipc.opt = &icmp_param.replyopts.opt;
	ipc.tx_flags = 0;
	ipc.ttl = 0;
	ipc.tos = -1;

	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
			       type, code, icmp_param);
			       type, code, &icmp_param);
	if (IS_ERR(rt))
		goto out_unlock;

	/* peer icmp_ratelimit */
	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
		goto ende;

@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
	room = dst_mtu(&rt->dst);
	if (room > 576)
		room = 576;
	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
	room -= sizeof(struct icmphdr);

	icmp_param->data_len = skb_in->len - icmp_param->offset;
	if (icmp_param->data_len > room)
		icmp_param->data_len = room;
	icmp_param->head_len = sizeof(struct icmphdr);
	icmp_param.data_len = skb_in->len - icmp_param.offset;
	if (icmp_param.data_len > room)
		icmp_param.data_len = room;
	icmp_param.head_len = sizeof(struct icmphdr);

	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
ende:
	ip_rt_put(rt);
out_unlock:
	icmp_xmit_unlock(sk);
out_free:
	kfree(icmp_param);
out_bh_enable:
	local_bh_enable();
out:;
}
EXPORT_SYMBOL(icmp_send);
+47 −21
Original line number Diff line number Diff line
@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};

/* Called with BH disabled */
static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
{
	struct sock *sk;

	local_bh_disable();

	sk = icmpv6_sk(net);
	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
		/* This can happen if the output path (f.e. SIT or
		 * ip6ip6 tunnel) signals dst_link_failure() for an
		 * outgoing ICMP6 packet.
		 */
		local_bh_enable();
		return NULL;
	}
	return sk;
@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)

static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
{
	spin_unlock_bh(&sk->sk_lock.slock);
	spin_unlock(&sk->sk_lock.slock);
}

/*
@@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
	return false;
}

static bool icmpv6_mask_allow(int type)
{
	/* Informational messages are not limited. */
	if (type & ICMPV6_INFOMSG_MASK)
		return true;

	/* Do not limit pmtu discovery, it would break it. */
	if (type == ICMPV6_PKT_TOOBIG)
		return true;

	return false;
}

static bool icmpv6_global_allow(int type)
{
	if (icmpv6_mask_allow(type))
		return true;

	if (icmp_global_allow())
		return true;

	return false;
}

/*
 * Check the ICMP output rate limit
 */
@@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
	struct dst_entry *dst;
	bool res = false;

	/* Informational messages are not limited. */
	if (type & ICMPV6_INFOMSG_MASK)
		return true;

	/* Do not limit pmtu discovery, it would break it. */
	if (type == ICMPV6_PKT_TOOBIG)
	if (icmpv6_mask_allow(type))
		return true;

	/*
@@ -200,21 +217,17 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
	} else {
		struct rt6_info *rt = (struct rt6_info *)dst;
		int tmo = net->ipv6.sysctl.icmpv6_time;
		struct inet_peer *peer;

		/* Give more bandwidth to wider prefixes. */
		if (rt->rt6i_dst.plen < 128)
			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);

		if (icmp_global_allow()) {
			struct inet_peer *peer;

			peer = inet_getpeer_v6(net->ipv6.peers,
					       &fl6->daddr, 1);
		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
		res = inet_peer_xrlim_allow(peer, tmo);
		if (peer)
			inet_putpeer(peer);
	}
	}
	dst_release(dst);
	return res;
}
@@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
		return;
	}

	/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
	local_bh_disable();

	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
	if (!icmpv6_global_allow(type))
		goto out_bh_enable;

	mip6_addr_swap(skb);

	memset(&fl6, 0, sizeof(fl6));
@@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,

	sk = icmpv6_xmit_lock(net);
	if (!sk)
		return;
		goto out_bh_enable;

	sk->sk_mark = mark;
	np = inet6_sk(sk);

@@ -552,6 +573,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
	dst_release(dst);
out:
	icmpv6_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
}

/* Slightly more convenient version of icmp6_send.
@@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
	fl6.flowi6_uid = sock_net_uid(net, NULL);
	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));

	local_bh_disable();
	sk = icmpv6_xmit_lock(net);
	if (!sk)
		return;
		goto out_bh_enable;
	sk->sk_mark = mark;
	np = inet6_sk(sk);

@@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
	dst_release(dst);
out:
	icmpv6_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
}

void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)