
Commit ca822141 authored by David S. Miller

Merge branch 'tcp-rbtree-retransmit-queue'



Eric Dumazet says:

====================
tcp: implement rb-tree based retransmit queue

This patch series implements an RB-tree based retransmit queue for TCP,
to better match modern BDP.

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87  351.48  339.59  338.62  306.72
204.07  304.93  291.88  202.47  176.88
->   2840

After patch:

1700.83 2207.98 2070.17 1544.26 2114.76
2124.89 1693.14 1080.91 2216.82 1299.94
->  18053

Average of 1805 Mbits instead of 284 Mbits.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents f5333f80 75c119af
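
The payoff is algorithmic: with retransmit skbs kept in an rb-tree keyed by
sequence number, SACK processing can locate the skb covering a given sequence
in O(log n) instead of walking a linked list in O(n). A minimal sketch of such
a lookup, assuming a tree ordered the way the hunks below order it (the helper
name is illustrative; rb_to_skb(), before() and TCP_SKB_CB() are the real
kernel helpers):

/* Sketch: find the rtx-queue skb whose [seq, end_seq) range covers @seq.
 * Illustrative only; the series' actual SACK lookup lives in tcp_input.c,
 * which is not part of this extract.
 */
static struct sk_buff *rtx_queue_lookup(const struct sock *sk, u32 seq)
{
	struct rb_node *node = sk->tcp_rtx_queue.rb_node;

	while (node) {
		struct sk_buff *skb = rb_to_skb(node);

		if (before(seq, TCP_SKB_CB(skb)->seq))
			node = node->rb_left;
		else if (!before(seq, TCP_SKB_CB(skb)->end_seq))
			node = node->rb_right;
		else
			return skb;	/* seq falls inside this skb */
	}
	return NULL;
}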
include/linux/skbuff.h  +18 −0
@@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
	return __skb_grow(skb, len);
}

#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root)  rb_to_skb(rb_last(root))
#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))

#define skb_queue_walk(queue, skb) \
		for (skb = (queue)->next;					\
		     skb != (struct sk_buff *)(queue);				\
@@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
		for (; skb != (struct sk_buff *)(queue);			\
		     skb = skb->next)

#define skb_rbtree_walk(skb, root)						\
		for (skb = skb_rb_first(root); skb != NULL;			\
		     skb = skb_rb_next(skb))

#define skb_rbtree_walk_from(skb)						\
		for (; skb != NULL;						\
		     skb = skb_rb_next(skb))

#define skb_rbtree_walk_from_safe(skb, tmp)					\
		for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);	\
		     skb = tmp)

#define skb_queue_walk_from_safe(queue, skb, tmp)				\
		for (tmp = skb->next;						\
		     skb != (struct sk_buff *)(queue);				\
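
The new macros make rb-tree skb traversal read like the list walkers above
them. A toy usage sketch, assuming an already-populated tree (the helper is
hypothetical, not part of the patch):

/* Sketch: total payload bytes held in an skb rb-tree. */
static unsigned int skb_rbtree_bytes(const struct rb_root *root)
{
	const struct sk_buff *skb;
	unsigned int bytes = 0;

	/* skb_rbtree_walk() starts at skb_rb_first() and follows rb_next() */
	skb_rbtree_walk(skb, root)
		bytes += skb->len;
	return bytes;
}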
include/net/sock.h  +5 −2
@@ -60,7 +60,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>

#include <linux/rbtree.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
	int			sk_wmem_queued;
	refcount_t		sk_wmem_alloc;
	unsigned long		sk_tsq_flags;
	union {
		struct sk_buff	*sk_send_head;
		struct rb_root	tcp_rtx_queue;
	};
	struct sk_buff_head	sk_write_queue;
	__s32			sk_peek_off;
	int			sk_write_pending;
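
Overlaying tcp_rtx_queue on sk_send_head in a union costs no space: struct
rb_root is a single rb_node pointer, and the send head is now derived by
peeking sk_write_queue instead (see tcp_send_head() in the tcp.h hunk below).
A sketch of a compile-time check for that size assumption (not part of the
patch):

static inline void tcp_rtx_queue_layout_check(void)
{
	/* struct rb_root holds one pointer, same size as sk_send_head */
	BUILD_BUG_ON(sizeof(struct rb_root) != sizeof(struct sk_buff *));
}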
include/net/tcp.h  +46 −54
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
enum tcp_queue {
	TCP_FRAG_IN_WRITE_QUEUE,
	TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
		 struct sk_buff *skb, u32 len,
		 unsigned int mss_now, gfp_t gfp);

void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
@@ -1606,19 +1612,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
	skb->_skb_refdst = _save;		\
}

/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
	struct sk_buff *skb;
void tcp_write_queue_purge(struct sock *sk);

	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		tcp_skb_tsorted_anchor_cleanup(skb);
		sk_wmem_free_skb(sk, skb);
	}
	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
	sk_mem_reclaim(sk);
	tcp_clear_all_retrans_hints(tcp_sk(sk));
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
	return skb_rb_first(&sk->tcp_rtx_queue);
}

static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
@@ -1643,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
	return skb_queue_prev(&sk->sk_write_queue, skb);
}

#define tcp_for_write_queue(skb, sk)					\
	skb_queue_walk(&(sk)->sk_write_queue, skb)

#define tcp_for_write_queue_from(skb, sk)				\
	skb_queue_walk_from(&(sk)->sk_write_queue, skb)

#define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)

static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
	return sk->sk_send_head;
	return skb_peek(&sk->sk_write_queue);
}

static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1663,27 +1655,28 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
	return skb_queue_is_last(&sk->sk_write_queue, skb);
}

static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
static inline bool tcp_write_queue_empty(const struct sock *sk)
{
	if (tcp_skb_is_last(sk, skb))
		sk->sk_send_head = NULL;
	else
		sk->sk_send_head = tcp_write_queue_next(sk, skb);
	return skb_queue_empty(&sk->sk_write_queue);
}

static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
	if (sk->sk_send_head == skb_unlinked) {
		sk->sk_send_head = NULL;
		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
	return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
}
	if (tcp_sk(sk)->highest_sack == skb_unlinked)
		tcp_sk(sk)->highest_sack = NULL;

static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
{
	return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
}

static inline void tcp_init_send_head(struct sock *sk)
static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
{
	sk->sk_send_head = NULL;
	if (tcp_write_queue_empty(sk))
		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);

	if (tcp_sk(sk)->highest_sack == skb_unlinked)
		tcp_sk(sk)->highest_sack = NULL;
}

static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
@@ -1696,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
	__tcp_add_write_queue_tail(sk, skb);

	/* Queue it, remembering where we must start sending. */
	if (sk->sk_send_head == NULL) {
		sk->sk_send_head = skb;
	if (sk->sk_write_queue.next == skb) {
		tcp_chrono_start(sk, TCP_CHRONO_BUSY);

		if (tcp_sk(sk)->highest_sack == NULL)
@@ -1710,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
	__skb_queue_head(&sk->sk_write_queue, skb);
}

/* Insert buff after skb on the write queue of sk.  */
static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
						struct sk_buff *buff,
						struct sock *sk)
{
	__skb_queue_after(&sk->sk_write_queue, skb, buff);
}

/* Insert new before skb on the write queue of sk.  */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
						  struct sk_buff *skb,
						  struct sock *sk)
{
	__skb_queue_before(&sk->sk_write_queue, skb, new);

	if (sk->sk_send_head == skb)
		sk->sk_send_head = new;
}

static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
	list_del(&skb->tcp_tsorted_anchor);
	tcp_skb_tsorted_anchor_cleanup(skb);
	__skb_unlink(skb, &sk->sk_write_queue);
}

static inline bool tcp_write_queue_empty(struct sock *sk)
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);

static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
	return skb_queue_empty(&sk->sk_write_queue);
	tcp_skb_tsorted_anchor_cleanup(skb);
	rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
}

static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
{
	list_del(&skb->tcp_tsorted_anchor);
	tcp_rtx_queue_unlink(skb, sk);
	sk_wmem_free_skb(sk, skb);
}

static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1767,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)

static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
{
	tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
						tcp_write_queue_next(sk, skb);
	struct sk_buff *next = skb_rb_next(skb);

	tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
}

static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1778,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)

static inline void tcp_highest_sack_reset(struct sock *sk)
{
	tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
	struct sk_buff *skb = tcp_rtx_queue_head(sk);

	tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
}

/* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1948,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
/* At how many usecs into the future should the RTO fire? */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
	const struct sk_buff *skb = tcp_write_queue_head(sk);
	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
	u32 rto = inet_csk(sk)->icsk_rto;
	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
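
tcp_rbtree_insert(), declared above, is implemented in tcp_output.c, outside
this extract. A sketch of the standard kernel rb-tree insertion pattern it
would follow, keyed on TCP_SKB_CB(skb)->seq:

void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct sk_buff *skb1;

	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		/* rtx queue holds disjoint seq ranges, ordered by seq */
		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, root);
}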

net/ipv4/tcp.c  +51 −12
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -469,8 +470,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
	tcp_init_buffer_space(sk);
}

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -699,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!tcp_send_head(sk))
		return;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

@@ -962,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
		int copy, i;
		bool can_coalesce;

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
		if (!skb || (copy = size_goal - skb->len) <= 0 ||
		    !tcp_skb_can_collapse_to(skb)) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
						  skb_queue_empty(&sk->sk_write_queue));
					tcp_rtx_and_write_queues_empty(sk));
			if (!skb)
				goto wait_for_memory;

@@ -1041,7 +1043,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,

out:
	if (copied) {
		tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
		tcp_tx_timestamp(sk, sk->sk_tsflags);
		if (!(flags & MSG_SENDPAGE_NOTLAST))
			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
@@ -1197,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
			goto out_err;
		}

		skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
		skb = tcp_write_queue_tail(sk);
		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
		if (!uarg) {
			err = -ENOBUFS;
@@ -1273,7 +1275,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
		int max = size_goal;

		skb = tcp_write_queue_tail(sk);
		if (tcp_send_head(sk)) {
		if (skb) {
			if (skb->ip_summed == CHECKSUM_NONE)
				max = mss_now;
			copy = max - skb->len;
@@ -1293,7 +1295,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
				process_backlog = false;
				goto restart;
			}
			first_skb = skb_queue_empty(&sk->sk_write_queue);
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			skb = sk_stream_alloc_skb(sk,
						  select_size(sk, sg, first_skb),
						  sk->sk_allocation,
@@ -1418,7 +1420,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)

out:
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
		tcp_tx_timestamp(sk, sockc.tsflags);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
@@ -1519,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)

	/* XXX -- need to support SO_PEEK_OFF */

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			return err;
		copied += skb->len;
	}

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
@@ -2318,6 +2327,37 @@ static inline bool tcp_need_reset(int state)
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

static void tcp_rtx_queue_purge(struct sock *sk)
{
	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		/* Since we are deleting whole queue, no need to
		 * list_del(&skb->tcp_tsorted_anchor)
		 */
		tcp_rtx_queue_unlink(skb, sk);
		sk_wmem_free_skb(sk, skb);
	}
}

void tcp_write_queue_purge(struct sock *sk)
{
	struct sk_buff *skb;

	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		tcp_skb_tsorted_anchor_cleanup(skb);
		sk_wmem_free_skb(sk, skb);
	}
	tcp_rtx_queue_purge(sk);
	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
	sk_mem_reclaim(sk);
	tcp_clear_all_retrans_hints(tcp_sk(sk));
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
@@ -2376,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
	 * issue in __tcp_select_window()
	 */
	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);
	dst_release(sk->sk_rx_dst);
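
The piece tying the two queues together lives in tcp_output.c and is not part
of this extract: once an skb has been transmitted, it leaves sk_write_queue
(pending data) and enters the rtx tree (sent, unacked data). A sketch of that
hand-off, using the tcp_rbtree_insert() declared in tcp.h (the helper name is
illustrative):

static void rtx_queue_take(struct sock *sk, struct sk_buff *skb)
{
	/* Sketch: move a freshly sent skb from the write queue
	 * to the retransmit rb-tree.
	 */
	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
}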
net/ipv4/tcp_fastopen.c  +3 −5
@@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct rb_node *p;
	struct sk_buff *skb;
	struct dst_entry *dst;
	struct sk_buff *skb;

	if (!tp->syn_fastopen)
		return;

	if (!tp->data_segs_in) {
		p = rb_first(&tp->out_of_order_queue);
		if (p && !rb_next(p)) {
			skb = rb_entry(p, struct sk_buff, rbnode);
		skb = skb_rb_first(&tp->out_of_order_queue);
		if (skb && !skb_rb_next(skb)) {
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
				tcp_fastopen_active_disable(sk);
				return;