
Commit 10e361e1 authored by David S. Miller

Merge branch 'tcp-default-RACK-loss-recovery'

Yuchung Cheng says:

====================
tcp: default RACK loss recovery

This patch set implements the features corresponding to the
draft-ietf-tcpm-rack-03 version of the RACK draft.
https://datatracker.ietf.org/meeting/101/materials/slides-101-tcpm-update-on-tcp-rack-00



1. SACK: implement equivalent DUPACK threshold heuristic in RACK to
   replace existing RFC6675 recovery (tcp_mark_head_lost).

2. Non-SACK: simplify the RFC6582 NewReno implementation.

3. RTO: apply RACK's time-based approach to avoid spuriously
   marking very recently sent packets lost.

4. With (1)(2)(3), make RACK the exclusive fast recovery mechanism to
   mark losses based on time on S/ACK. Tail loss probe and F-RTO remain
   enabled by default as complementary mechanisms to send probes in the
   CA_Open and CA_Loss states. The probes solicit S/ACKs to trigger
   RACK's time-based loss detection (see the sketch after this list).
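
To make the time-based rule behind (3) and (4) concrete, here is a rough
standalone sketch (hypothetical rack_state/pkt types and rack_is_lost()
helper, for illustration only; this is not the kernel implementation): an
unacknowledged packet is declared lost only once the most recently measured
RTT plus a reordering window has elapsed since it was sent, which is why an
RTO no longer spuriously marks very recently sent packets lost.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified per-connection RACK state. */
struct rack_state {
	uint64_t rtt_us;	/* RTT of the most recently (s)acked packet */
	uint64_t reo_wnd_us;	/* reordering window, e.g. min_rtt/4 */
};

/* Hypothetical per-packet state. */
struct pkt {
	uint64_t xmit_time_us;	/* when the packet was (last) transmitted */
	bool sacked;		/* already selectively acknowledged? */
};

/* Simplified RACK rule: assuming some packet sent *after* this one has
 * already been (s)acked, this packet is lost once it has gone un-(s)acked
 * for rtt + reo_wnd since its own transmission.
 */
static bool rack_is_lost(const struct rack_state *r, const struct pkt *p,
			 uint64_t now_us)
{
	int64_t remaining = (int64_t)(r->rtt_us + r->reo_wnd_us) -
			    (int64_t)(now_us - p->xmit_time_us);

	return !p->sacked && remaining <= 0;
}

int main(void)
{
	struct rack_state r = { .rtt_us = 20000, .reo_wnd_us = 5000 };
	struct pkt old_pkt = { .xmit_time_us = 0 };	/* sent 100 ms ago */
	struct pkt new_pkt = { .xmit_time_us = 90000 };	/* sent 10 ms ago */

	/* At t = 100 ms the old packet exceeded rtt + reo_wnd; the fresh
	 * one has not, so a timeout does not mark it lost yet.
	 */
	printf("old lost: %d, new lost: %d\n",
	       rack_is_lost(&r, &old_pkt, 100000),
	       rack_is_lost(&r, &new_pkt, 100000));
	return 0;
}

This mirrors the remaining-time computation that the diff below exports as
tcp_rack_skb_timeout() and the reordering window returned by
tcp_rack_reo_wnd(); the kernel additionally walks the retransmit queue and
arms a reordering timer for packets whose window has not yet expired.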

All Google web and internal servers have been running RACK-only mode
(4) for a while now. A/B experiments indicate that RACK/TLP reduces
recovery latency by 10% on average compared to RFC6675. RFC6675 is now
off by default but can be re-enabled by disabling RACK (sysctl
net.ipv4.tcp_recovery=0) should unforeseen issues arise.
====================
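
For reference when reading the documentation hunk below, net.ipv4.tcp_recovery
is a bitmask. The following standalone sketch decodes the three documented
bits (the describe() helper is hypothetical and purely illustrative, not a
kernel or libc API):

#include <stdio.h>

/* Bit values as documented for net.ipv4.tcp_recovery. */
#define TCP_RACK_LOSS_DETECTION	0x1	/* RACK loss detection (default on) */
#define TCP_RACK_STATIC_REO_WND	0x2	/* static reordering window (min_rtt/4) */
#define TCP_RACK_NO_DUPTHRESH	0x4	/* disable RACK's DUPACK threshold heuristic */

static void describe(unsigned int tcp_recovery)
{
	printf("net.ipv4.tcp_recovery = 0x%x\n", tcp_recovery);
	printf("  RACK loss detection: %s\n",
	       tcp_recovery & TCP_RACK_LOSS_DETECTION ?
	       "on (subsumes RFC6675 for SACK flows)" : "off (RFC6675 recovery used)");
	printf("  static reo window:   %s\n",
	       tcp_recovery & TCP_RACK_STATIC_REO_WND ? "yes" : "no");
	printf("  DUPACK threshold:    %s\n",
	       tcp_recovery & TCP_RACK_NO_DUPTHRESH ? "disabled" : "enabled");
}

int main(void)
{
	describe(0x1);	/* the new default: RACK-only fast recovery */
	describe(0x0);	/* RACK disabled: fall back to RFC6675 for SACK flows */
	return 0;
}

Setting the sysctl to 0, as in the second call, is the escape hatch the cover
letter mentions for reverting to RFC6675-based recovery.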

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 9611d6d6 56f8c5d7
Documentation/networking/ip-sysctl.txt  +3 −1
@@ -449,8 +449,10 @@ tcp_recovery - INTEGER
	features.

	RACK: 0x1 enables the RACK loss detection for fast detection of lost
-	      retransmissions and tail drops.
+	      retransmissions and tail drops. It also subsumes and disables
+	      RFC6675 recovery for SACK connections.
	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
+	RACK: 0x4 disables RACK's DUPACK threshold heuristic

	Default: 0x1

include/net/tcp.h  +5 −0
@@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3];

#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
+#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -1876,6 +1877,10 @@ void tcp_v4_init(void);
void tcp_init(void);

/* tcp_recovery.c */
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
+extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
+				u32 reo_wnd);
extern void tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
			     u64 xmit_time);
net/ipv4/tcp_input.c  +53 −46
@@ -1917,19 +1917,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
	tp->undo_retrans = tp->retrans_out ? : -1;
}

-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
 * and reset tags completely, otherwise preserve SACKs. If receiver
 * dropped its ofo queue, we will know this due to reneging detection.
 */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *head;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
+
+	head = tcp_rtx_queue_head(sk);
+	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
+	} else if (tcp_is_reno(tp)) {
+		tcp_reset_reno_sack(tp);
+	}
+
+	skb = head;
+	skb_rbtree_walk_from(skb) {
+		if (is_reneg)
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		else if (tcp_is_rack(sk) && skb != head &&
+			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
+			continue; /* Don't mark recently sent ones lost yet */
+		tcp_mark_skb_lost(sk, skb);
+	}
+	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
void tcp_enter_loss(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
-	struct sk_buff *skb;
	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
-	bool is_reneg;			/* is receiver reneging on SACKs? */
-	bool mark_lost;
+
+	tcp_timeout_mark_lost(sk);

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1941,40 +1976,10 @@ void tcp_enter_loss(struct sock *sk)
		tcp_ca_event(sk, CA_EVENT_LOSS);
		tcp_init_undo(tp);
	}
-	tp->snd_cwnd	   = 1;
+	tp->snd_cwnd	   = tcp_packets_in_flight(tp) + 1;
	tp->snd_cwnd_cnt   = 0;
	tp->snd_cwnd_stamp = tcp_jiffies32;

-	tp->retrans_out = 0;
-	tp->lost_out = 0;
-
-	if (tcp_is_reno(tp))
-		tcp_reset_reno_sack(tp);
-
-	skb = tcp_rtx_queue_head(sk);
-	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
-	if (is_reneg) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
-		tp->sacked_out = 0;
-		/* Mark SACK reneging until we recover from this loss event. */
-		tp->is_sack_reneg = 1;
-	}
-	tcp_clear_all_retrans_hints(tp);
-
-	skb_rbtree_walk_from(skb) {
-		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			     is_reneg);
-		if (mark_lost)
-			tcp_sum_lost(tp, skb);
-		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (mark_lost) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-		}
-	}
-	tcp_verify_left_out(tp);
-
	/* Timeout in disordered state after receiving substantial DUPACKs
	 * suggests that the degree of reordering is over-estimated.
	 */
@@ -2141,7 +2146,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
		return true;

	/* Not-A-Trick#2 : Classic rule... */
-	if (tcp_dupack_heuristics(tp) > tp->reordering)
+	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	return false;
@@ -2218,9 +2223,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

-	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1, 1);
-	} else {
+	if (tcp_is_sack(tp)) {
		int sacked_upto = tp->sacked_out - tp->reordering;
		if (sacked_upto >= 0)
			tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2718,12 +2721,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
	return false;
}

-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

-	/* Use RACK to detect loss */
-	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+	if (tcp_rtx_queue_empty(sk))
+		return;
+
+	if (unlikely(tcp_is_reno(tp))) {
+		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+	} else if (tcp_is_rack(sk)) {
		u32 prior_retrans = tp->retrans_out;

		tcp_rack_mark_lost(sk);
@@ -2819,11 +2826,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
			tcp_try_keep_open(sk);
			return;
		}
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
		break;
	case TCP_CA_Loss:
		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
		      (*ack_flag & FLAG_LOST_RETRANS)))
			return;
@@ -2840,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
			tcp_try_undo_dsack(sk);

-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
		if (!tcp_time_to_recover(sk, flag)) {
			tcp_try_to_open(sk, flag);
			return;
@@ -2862,7 +2869,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
		fast_rexmit = 1;
	}

-	if (do_lost)
+	if (!tcp_is_rack(sk) && do_lost)
		tcp_update_scoreboard(sk, fast_rexmit);
	*rexmit = REXMIT_LOST;
}
net/ipv4/tcp_recovery.c  +63 −17
@@ -2,7 +2,7 @@
#include <linux/tcp.h>
#include <net/tcp.h>

-static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

@@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

+u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rack.reord) {
+		/* If reordering has not been observed, be aggressive during
+		 * the recovery or starting the recovery by DUPACK threshold.
+		 */
+		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+			return 0;
+
+		if (tp->sacked_out >= tp->reordering &&
+		    !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+			return 0;
+	}
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay.
+	 * Use min_rtt instead of the smoothed RTT because reordering is
+	 * often a path property and less related to queuing or delayed ACKs.
+	 * Upon receiving DSACKs, linearly increase the window up to the
+	 * smoothed RTT.
+	 */
+	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+		   tp->srtt_us >> 3);
+}
+
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+	return tp->rack.rtt_us + reo_wnd -
+	       tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+}
+
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
-	u32 min_rtt = tcp_min_rtt(tp);
	struct sk_buff *skb, *n;
	u32 reo_wnd;

	*reo_timeout = 0;
-	/* To be more reordering resilient, allow min_rtt/4 settling delay
-	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
-	 * RTT because reordering is often a path property and less related
-	 * to queuing or delayed ACKs.
-	 */
-	reo_wnd = 1000;
-	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
-	    min_rtt != ~0U) {
-		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
-		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
-	}
-
+	reo_wnd = tcp_rack_reo_wnd(sk);
	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
				 tcp_tsorted_anchor) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
		/* A packet is lost if it has not been s/acked beyond
		 * the recent RTT plus the reordering window.
		 */
-		remaining = tp->rack.rtt_us + reo_wnd -
-			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
		if (remaining <= 0) {
-			tcp_rack_mark_skb_lost(sk, skb);
+			tcp_mark_skb_lost(sk, skb);
			list_del_init(&skb->tcp_tsorted_anchor);
		} else {
			/* Record maximum wait time */
@@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
		tp->rack.reo_wnd_steps = 1;
	}
}

+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+	const u8 state = inet_csk(sk)->icsk_ca_state;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+	    (state == TCP_CA_Recovery && snd_una_advanced)) {
+		struct sk_buff *skb = tcp_rtx_queue_head(sk);
+		u32 mss;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+			return;
+
+		mss = tcp_skb_mss(skb);
+		if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+			tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				     mss, mss, GFP_ATOMIC);
+
+		tcp_skb_mark_lost_uncond_verify(tp, skb);
+	}
+}