Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4f693b55 authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller
Browse files

tcp: implement coalescing on backlog queue



In case GRO is not as efficient as it should be or disabled,
we might have a user thread trapped in __release_sock() while
softirq handler flood packets up to the point we have to drop.

This patch balances work done from user thread and softirq,
to give more chances to __release_sock() to complete its work
before new packets are added the the backlog.

This also helps if we receive many ACK packets, since GRO
does not aggregate them.

This patch brings ~60% throughput increase on a receiver
without GRO, but the spectacular gain is really on
1000x release_sock() latency reduction I have measured.

Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: default avatarNeal Cardwell <ncardwell@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 85bdf7db
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -243,6 +243,7 @@ enum
	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
	LINUX_MIB_TCPRCVCOALESCE,		/* TCPRcvCoalesce */
	LINUX_MIB_TCPBACKLOGCOALESCE,		/* TCPBacklogCoalesce */
	LINUX_MIB_TCPOFOQUEUE,			/* TCPOFOQueue */
	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
+1 −0
Original line number Diff line number Diff line
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+86 −6
Original line number Diff line number Diff line
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);