Commit 337f1b07 authored by David S. Miller

Merge branch 'tcp-xmit-timer-rearming'



Neal Cardwell says:

====================
tcp: fix xmit timer rearming to avoid stalls

This patch series is a bug fix for a TCP loss recovery performance bug
reported independently in recent netdev threads:

 (i)  July 26, 2017: netdev thread "TCP fast retransmit issues"
 (ii) July 26, 2017: netdev thread:
       "[PATCH V2 net-next] TLP: Don't reschedule PTO when there's one
       outstanding TLP retransmission"

Many thanks to Klavs Klavsen and Mao Wenan for the detailed reports,
traces, and packetdrill test cases, which enabled us to root-cause
this issue and verify the fix.

- v1 -> v2:
 - In patch 2/3, clarified an unclear comment in the pre-existing code
   in tcp_schedule_loss_probe() (thanks to Eric Dumazet for suggesting
   we improve this).
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents b91d5329 df92c839
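As the diffs below suggest, the core of the fix is a change in how the xmit timer is re-armed: instead of resetting the RTO relative to the current time on incoming ACKs (which, per the existing comment in tcp_clean_rtx_queue(), lets "the timeout ... continue to extend in loss recovery"), the deadline is anchored to when the head of the write queue was last (re)transmitted, and the timer is only reset when an ACK actually ACKs or SACKs data. A rough user-space sketch of the difference, with entirely made-up numbers and helper names (this is not kernel code):

/* Illustrative only: a trickle of ACKs that never ACK/SACK new data keeps
 * pushing an RTO deadline computed from "now", but not one computed from
 * the transmit time of the earliest outstanding packet. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t head_xmit_us = 0;      /* earliest unacked packet sent at t = 0 */
	uint64_t rto_us = 300000;       /* hypothetical RTO of 300 ms            */

	/* Five ACKs arrive 100 ms apart, none making forward progress. */
	for (uint64_t now_us = 100000; now_us <= 500000; now_us += 100000) {
		uint64_t old_deadline = now_us + rto_us;        /* creeps forward */
		uint64_t new_deadline = head_xmit_us + rto_us;  /* stays fixed    */
		printf("t=%6llu us  old deadline=%6llu us  new deadline=%6llu us\n",
		       (unsigned long long)now_us,
		       (unsigned long long)old_deadline,
		       (unsigned long long)new_deadline);
	}
	return 0;
}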
+10 −0
@@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
 
+/* At how many usecs into the future should the RTO fire? */
+static inline s64 tcp_rto_delta_us(const struct sock *sk)
+{
+	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	u32 rto = inet_csk(sk)->icsk_rto;
+	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+
+	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
+}
+
 /*
  * Save and compile IPv4 options, return a pointer to it
  */
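The new tcp_rto_delta_us() helper returns how far in the future, in microseconds, the RTO should fire: the deadline is the head skb's transmit timestamp plus the current RTO, and the result can be negative if that deadline has already passed. A standalone sketch of the same arithmetic with hypothetical values (user-space C, not the kernel's field names):

/* Sketch of the tcp_rto_delta_us() arithmetic; names and values are illustrative. */
#include <stdio.h>
#include <stdint.h>

/* RTO deadline = head transmit time + RTO; delta = deadline - now. */
static int64_t rto_delta_us(uint64_t head_mstamp_us, uint64_t rto_us, uint64_t now_us)
{
	return (int64_t)(head_mstamp_us + rto_us) - (int64_t)now_us;
}

int main(void)
{
	/* Head of the write queue was sent 250 ms ago; RTO estimate is 200 ms. */
	uint64_t now_us = 1000000, head_us = now_us - 250000, rto_us = 200000;
	int64_t delta = rto_delta_us(head_us, rto_us, now_us);

	if (delta > 0)
		printf("re-arm the RTO to fire in %lld us\n", (long long)delta);
	else
		printf("RTO is already overdue by %lld us\n", (long long)-delta);
	return 0;
}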
+17 −13
@@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
@@ -3004,10 +3005,7 @@ void tcp_rearm_rto(struct sock *sk)
 		/* Offset the time elapsed after installing regular RTO */
 		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
-			struct sk_buff *skb = tcp_write_queue_head(sk);
-			u64 rto_time_stamp = skb->skb_mstamp +
-					     jiffies_to_usecs(rto);
-			s64 delta_us = rto_time_stamp - tp->tcp_mstamp;
+			s64 delta_us = tcp_rto_delta_us(sk);
 			/* delta_us may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
@@ -3019,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+	if (!tcp_schedule_loss_probe(sk))
+		tcp_rearm_rto(sk);
+}
+
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3180,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 					ca_rtt_us, sack->rate);
 
 	if (flag & FLAG_ACKED) {
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3208,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
 		 */
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3580,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
-		tcp_rearm_rto(sk);
-
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
@@ -3647,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+	/* If needed, reset TLP/RTO timer; RACK may later override this. */
+	if (flag & FLAG_SET_XMIT_TIMER)
+		tcp_set_xmit_timer(sk);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	}
-	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
-		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
 	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
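Taken together, the tcp_input.c changes funnel timer re-arming through a single point: the call sites that previously called tcp_rearm_rto() directly now only set FLAG_SET_XMIT_TIMER, and tcp_ack() later calls tcp_set_xmit_timer(), which tries to schedule a loss probe and falls back to the RTO. A simplified user-space model of that control flow, with stubbed-out, hypothetical helpers standing in for the kernel functions:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_SET_XMIT_TIMER 0x1000   /* mirrors the new flag in the diff */

/* Stubs standing in for tcp_schedule_loss_probe()/tcp_rearm_rto(). */
static bool schedule_loss_probe(void) { puts("armed TLP"); return true; }
static void rearm_rto(void)           { puts("armed RTO"); }

static void set_xmit_timer(void)
{
	/* Same shape as the new tcp_set_xmit_timer(): TLP first, else RTO. */
	if (!schedule_loss_probe())
		rearm_rto();
}

int main(void)
{
	int flag = 0;
	bool acked_or_sacked_data = true;   /* hypothetical ACK outcome */

	/* Only ACKs that make forward progress request a timer reset. */
	if (acked_or_sacked_data)
		flag |= FLAG_SET_XMIT_TIMER;

	if (flag & FLAG_SET_XMIT_TIMER)
		set_xmit_timer();
	return 0;
}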
+5 −18
@@ -2377,24 +2377,15 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+	u32 timeout, rto_delta_us;
 
-	/* No consecutive loss probes. */
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
-		tcp_rearm_rto(sk);
-		return false;
-	}
 	/* Don't do any loss probe on a Fast Open connection before 3WHS
 	 * finishes.
 	 */
 	if (tp->fastopen_rsk)
 		return false;
 
-	/* TLP is only scheduled when next timer event is RTO. */
-	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
-		return false;
-
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
@@ -2417,14 +2408,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
 	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
 
-	/* If RTO is shorter, just schedule TLP in its place. */
-	tlp_time_stamp = tcp_jiffies32 + timeout;
-	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
-	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
-		s32 delta = rto_time_stamp - tcp_jiffies32;
-		if (delta > 0)
-			timeout = delta;
-	}
+	/* If the RTO formula yields an earlier time, then use that time. */
+	rto_delta_us = tcp_rto_delta_us(sk);  /* How far in future is RTO? */
+	if (rto_delta_us > 0)
+		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
 				  TCP_RTO_MAX);
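In tcp_schedule_loss_probe(), the TLP timeout is now capped by how far in the future the RTO should fire according to tcp_rto_delta_us(), rather than being derived from icsk_timeout. A rough user-space sketch of that capping with made-up numbers (the kernel converts the microsecond delta to jiffies before taking the minimum; the names here are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Pick the earlier of the TLP deadline and a positive RTO deadline. */
static uint64_t cap_tlp_timeout(uint64_t tlp_timeout_us, int64_t rto_delta_us)
{
	if (rto_delta_us > 0 && (uint64_t)rto_delta_us < tlp_timeout_us)
		return (uint64_t)rto_delta_us;
	return tlp_timeout_us;
}

int main(void)
{
	uint64_t tlp_timeout_us = 120000;  /* hypothetical ~2*RTT probe timeout */
	int64_t rto_delta_us = 80000;      /* RTO due in 80 ms                  */

	printf("effective timer: %llu us\n",
	       (unsigned long long)cap_tlp_timeout(tlp_timeout_us, rto_delta_us));
	return 0;
}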