Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 718e14bb authored by David S. Miller
Browse files

Merge branch 'tcp-RACK-fast-recovery'

Yuchung Cheng says:

====================
tcp: RACK fast recovery

The patch set enables RACK loss detection (draft-ietf-tcpm-rack-01)
to trigger fast recovery with a reordering timer.

Previously, RACK ran in an auxiliary mode in which it was only
used to detect packet losses once recovery had been triggered by
other algorithms (e.g., FACK). By inspecting packet timestamps,
RACK can start ACK-driven repairs in a timely manner. A few similar heuristics
are no longer needed and are either removed or disabled to reduce
the complexity of the Linux TCP loss recovery engine:

  1. FACK (Forward Acknowledgement)
  2. Early Retransmit (RFC5827)
  3. thin_dupack (fast recovery on single DUPACK for thin-streams)
  4. NCR (Non-Congestion Robustness, RFC4653)
  5. Forward Retransmit

After this change, Linux's loss recovery algorithms consist of
  1. Conventional DUPACK threshold approach (RFC6675)
  2. RACK and Tail Loss Probe (draft-ietf-tcpm-rack-01)
  3. RTO plus F-RTO extension (RFC5682)

The patch set has been tested on Google servers extensively and
presented in several IETF meetings. The data suggests that RACK
successfully improves recovery performance:
https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-draft-ietf-tcpm-rack-01.pdf
https://www.ietf.org/proceedings/96/slides/slides-96-tcpm-3.pdf


====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7410191a 94bdc978
Loading
Loading
Loading
Loading
+5 −26
Original line number Diff line number Diff line
@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
	Allows TCP to send "duplicate" SACKs.

tcp_early_retrans - INTEGER
	Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
	for triggering fast retransmit when the amount of outstanding data is
	small and when no previously unsent data can be transmitted (such
	that limited transmit could be used). Also controls the use of
	Tail loss probe (TLP) that converts RTOs occurring due to tail
	losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
	Tail loss probe (TLP) converts RTOs occurring due to tail
	losses into fast recovery (draft-ietf-tcpm-rack). Note that
	TLP requires RACK to function properly (see tcp_recovery below)
	Possible values:
		0 disables ER
		1 enables ER
		2 enables ER but delays fast recovery and fast retransmit
		  by a fourth of RTT. This mitigates connection falsely
		  recovers when network has a small degree of reordering
		  (less than 3 packets).
		3 enables delayed ER and TLP.
		4 enables TLP only.
		0 disables TLP
		3 or 4 enables TLP
	Default: 3

tcp_ecn - INTEGER
@@ -712,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN
	Documentation/networking/tcp-thin.txt
	Default: 0

tcp_thin_dupack - BOOLEAN
	Enable dynamic triggering of retransmissions after one dupACK
	for thin streams. If set, a check is performed upon reception
	of a dupACK to determine if the stream is thin (less than 4
	packets in flight). As long as the stream is found to be thin,
	data is retransmitted on the first received dupACK. This
	improves retransmission latency for non-aggressive thin
	streams, often found to be time-dependent.
	For more information on thin streams, see
	Documentation/networking/tcp-thin.txt
	Default: 0

tcp_limit_output_bytes - INTEGER
	Controls TCP Small Queue limit per tcp socket.
	TCP bulk sender tends to increase packets in flight until it
+4 −4
Original line number Diff line number Diff line
@@ -207,6 +207,8 @@ struct tcp_sock {
	/* Information of the most recently (s)acked skb */
	struct tcp_rack {
		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
		u32 rtt_us;  /* Associated RTT */
		u32 end_seq; /* Ending TCP sequence of the skb */
		u8 advanced; /* mstamp advanced since last lost marking */
		u8 reord;    /* reordering detected */
	} rack;
@@ -218,12 +220,11 @@ struct tcp_sock {
		unused:5;
	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
		thin_lto    : 1,/* Use linear timeouts for thin streams */
		thin_dupack : 1,/* Fast retransmit on first dupack      */
		unused1	    : 1,
		repair      : 1,
		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
	u8	repair_queue;
	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
		syn_data:1,	/* SYN includes data */
	u8	syn_data:1,	/* SYN includes data */
		syn_fastopen:1,	/* SYN includes Fast Open option */
		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
@@ -305,7 +306,6 @@ struct tcp_sock {
					 */

	int     lost_cnt_hint;
	u32     retransmit_high;	/* L-bits may be on up to this seqno */

	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
	u32	high_seq;	/* snd_nxt at onset of congestion	*/
+3 −1
Original line number Diff line number Diff line
@@ -144,6 +144,7 @@ struct inet_connection_sock {
#define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
#define ICSK_TIME_EARLY_RETRANS 4	/* Early retransmit timer */
#define ICSK_TIME_LOSS_PROBE	5	/* Tail loss probe timer */
#define ICSK_TIME_REO_TIMEOUT	6	/* Reordering timer */

static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
	}

	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
	    what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {
	    what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
	    what == ICSK_TIME_REO_TIMEOUT) {
		icsk->icsk_pending = what;
		icsk->icsk_timeout = jiffies + when;
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+11 −29
Original line number Diff line number Diff line
@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
					                 * for local resources.
					                 */
#define TCP_REO_TIMEOUT_MIN	(2000) /* Min RACK reordering timeout in usec */

#define TCP_KEEPALIVE_TIME	(120*60*HZ)	/* two hours */
#define TCP_KEEPALIVE_PROBES	9		/* Max of 9 keepalive probes	*/
@@ -261,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle;
extern int sysctl_tcp_thin_linear_timeouts;
extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_recovery;
#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */

extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
extern int sysctl_tcp_min_tso_segs;
@@ -397,6 +401,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
@@ -541,6 +546,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);

@@ -559,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
			     const struct sk_buff *next_skb);

/* tcp_input.c */
void tcp_resume_early_retransmit(struct sock *sk);
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk);
@@ -1031,23 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
	tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
}

/* TCP early-retransmit (ER) is similar to but more conservative than
 * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
 */
static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
{
	struct net *net = sock_net((struct sock *)tp);

	tp->do_early_retrans = sysctl_tcp_early_retrans &&
		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
		net->ipv4.sysctl_tcp_reordering == 3;
}

static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
{
	tp->do_early_retrans = 0;
}

static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
	return tp->sacked_out + tp->lost_out;
@@ -1856,17 +1844,11 @@ void tcp_v4_init(void);
void tcp_init(void);

/* tcp_recovery.c */

/* Flags to enable various loss recovery features. See below */
extern int sysctl_tcp_recovery;

/* Use TCP RACK to detect (some) tail and retransmit losses */
#define TCP_RACK_LOST_RETRANS  0x1

extern int tcp_rack_mark_lost(struct sock *sk);

extern void tcp_rack_advance(struct tcp_sock *tp,
			     const struct skb_mstamp *xmit_time, u8 sacked);
extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
			     const struct skb_mstamp *xmit_time,
			     const struct skb_mstamp *ack_time);
extern void tcp_rack_reo_timeout(struct sock *sk);

/*
 * Save and compile IPv4 options, return a pointer to it
+1 −1
Original line number Diff line number Diff line
@@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
	}

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		r->idiag_timer = 1;
		r->idiag_retrans = icsk->icsk_retransmits;
Loading