Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit dfb4b9dc authored by David S. Miller
Browse files

[TCP] Vegas: timestamp before clone



We have to store the congestion control timestamp on the SKB before we
clone it, not after.  Else we get no timestamping information at all.

tcp_transmit_skb() has been reworked so that we can do the timestamp
still in one spot, instead of at all the call sites.

Problem discovered, and initial fix, from Tom Young
<tyo@ee.unimelb.edu.au>.

Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0d7bef60
Loading
Loading
Loading
Loading
+124 −109
Original line number Original line Diff line number Diff line
@@ -262,30 +262,45 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 * We are working here with either a clone of the original
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
{
	if (skb != NULL) {
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
		struct inet_sock *inet = inet_sk(sk);
	struct inet_sock *inet;
		struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sock *tp;
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	struct tcp_skb_cb *tcb;
		int tcp_header_size = tp->tcp_header_len;
	int tcp_header_size;
	struct tcphdr *th;
	struct tcphdr *th;
	int sysctl_flags;
	int sysctl_flags;
	int err;
	int err;


		BUG_ON(!tcp_skb_pcount(skb));
	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->rtt_sample)
		__net_timestamp(skb);

	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;


#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4
#define SYSCTL_FLAG_SACK	0x4


		/* If congestion control is doing timestamping */
		if (icsk->icsk_ca_ops->rtt_sample)
			__net_timestamp(skb);

	sysctl_flags = 0;
	sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if(sysctl_tcp_timestamps) {
		if(sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
@@ -300,12 +315,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
		}
		} else if (tp->rx_opt.eff_sacks) {
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
				    (tp->rx_opt.eff_sacks *
				     TCPOLEN_SACK_PERBLOCK));
	}
	}
		
		
	if (tcp_packets_in_flight(tp) == 0)
	if (tcp_packets_in_flight(tp) == 0)
@@ -320,8 +336,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	th->dest		= inet->dport;
	th->dest		= inet->dport;
	th->seq			= htonl(tcb->seq);
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
	th->ack_seq		= htonl(tp->rcv_nxt);
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
	*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
		if (tcb->flags & TCPCB_FLAG_SYN) {
					tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 * is never scaled.
		 */
		 */
@@ -332,13 +350,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	th->check		= 0;
	th->check		= 0;
	th->urg_ptr		= 0;
	th->urg_ptr		= 0;


		if (tp->urg_mode &&
	if (unlikely(tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
		     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
		th->urg_ptr		= htons(tp->snd_up-tcb->seq);
		th->urg_ptr		= htons(tp->snd_up-tcb->seq);
		th->urg			= 1;
		th->urg			= 1;
	}
	}


		if (tcb->flags & TCPCB_FLAG_SYN) {
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_syn_build_options((__u32 *)(th + 1),
		tcp_syn_build_options((__u32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
@@ -350,12 +368,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	} else {
	} else {
		tcp_build_and_update_options((__u32 *)(th + 1),
		tcp_build_and_update_options((__u32 *)(th + 1),
					     tp, tcb->when);
					     tp, tcb->when);

		TCP_ECN_send(sk, tp, skb, tcp_header_size);
		TCP_ECN_send(sk, tp, skb, tcp_header_size);
	}
	}

	tp->af_specific->send_check(sk, th, skb->len, skb);
	tp->af_specific->send_check(sk, th, skb->len, skb);


		if (tcb->flags & TCPCB_FLAG_ACK)
	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));


	if (skb->len != tcp_header_size)
	if (skb->len != tcp_header_size)
@@ -364,7 +382,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	TCP_INC_STATS(TCP_MIB_OUTSEGS);
	TCP_INC_STATS(TCP_MIB_OUTSEGS);


	err = tp->af_specific->queue_xmit(skb, 0);
	err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
	if (unlikely(err <= 0))
		return err;
		return err;


	tcp_enter_cwr(sk);
	tcp_enter_cwr(sk);
@@ -376,8 +394,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	 * invokes us to send less aggressively.
	 * invokes us to send less aggressively.
	 */
	 */
	return err == NET_XMIT_CN ? 0 : err;
	return err == NET_XMIT_CN ? 0 : err;
	}

	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
#undef SYSCTL_FLAG_SACK
@@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)


		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		TCP_SKB_CB(skb)->when = tcp_time_stamp;


		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
			break;
			break;


		/* Advance the send_head.  This one is sent out.
		/* Advance the send_head.  This one is sent out.
@@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
		/* Send it out now. */
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		TCP_SKB_CB(skb)->when = tcp_time_stamp;


		if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
			update_send_head(sk, tp, skb);
			update_send_head(sk, tp, skb);
			tcp_cwnd_validate(sk, tp);
			tcp_cwnd_validate(sk, tp);
			return;
			return;
@@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
	 */
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;


	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				    pskb_copy(skb, GFP_ATOMIC):
				    skb_clone(skb, GFP_ATOMIC)));


	if (err == 0) {
	if (err == 0) {
		/* Update global TCP statistics. */
		/* Update global TCP statistics. */
@@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
}
}


@@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk)
		TCP_ECN_send_synack(tcp_sk(sk), skb);
		TCP_ECN_send_synack(tcp_sk(sk), skb);
	}
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
}


/*
/*
@@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk)
	__skb_queue_tail(&sk->sk_write_queue, buff);
	__skb_queue_tail(&sk->sk_write_queue, buff);
	sk_charge_skb(sk, buff);
	sk_charge_skb(sk, buff);
	tp->packets_out += tcp_skb_pcount(buff);
	tp->packets_out += tcp_skb_pcount(buff);
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);


	/* Timer for repeating the SYN until an answer. */
	/* Timer for repeating the SYN until an answer. */
@@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk)
		/* Send it off, this clears delayed acks for us. */
		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
		tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
	}
	}
}
}


@@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
}


int tcp_write_wakeup(struct sock *sk)
int tcp_write_wakeup(struct sock *sk)
@@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk)


			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
			if (!err) {
			if (!err) {
				update_send_head(sk, tp, skb);
				update_send_head(sk, tp, skb);
			}
			}