Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 740b0f18 authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller
Browse files

tcp: switch rtt estimations to usec resolution



Upcoming congestion controls for TCP require usec resolution for RTT
estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also require this change for finer control
and removal of bimodal behavior due to the current hack in
tcp_update_pacing_rate() for 'small rtt'

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility :
tcp_metrics used to export RTT and RTTVAR in msec resolution,
so we added RTT_US and RTTVAR_US. An iproute2 patch is needed
to use the new attributes if provided by the kernel.

In this example ss command displays a srtt of 32 usecs (10Gbit link)

lpk51:~# ./ss -i dst lpk52
Netid  State      Recv-Q Send-Q   Local Address:Port       Peer
Address:Port
tcp    ESTAB      0      1         10.246.11.51:42959
10.246.11.52:64614
         cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448
cwnd:10 send
3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

Updated iproute2 ip command displays :

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source
10.246.11.51

Old binary displays :

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source
10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng

Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Acked-by: default avatarNeal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 363ec392
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -201,10 +201,10 @@ struct tcp_sock {
	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
	u32	srtt;		/* smoothed round trip time << 3	*/
	u32	mdev;		/* medium deviation			*/
	u32	mdev_max;	/* maximal mdev for the last rtt period	*/
	u32	rttvar;		/* smoothed mdev_max			*/
	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
	u32	mdev_us;	/* medium deviation			*/
	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
	u32	rttvar_us;	/* smoothed mdev_max			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/

	u32	packets_out;	/* Packets which are "in flight"	*/
+7 −3
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#include <linux/crypto.h>
#include <linux/cryptohash.h>
#include <linux/kref.h>
#include <linux/ktime.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -478,7 +479,6 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
			     struct ip_options *opt);
#ifdef CONFIG_SYN_COOKIES
#include <linux/ktime.h>

/* Syncookies use a monotonic timer which increments every 64 seconds.
 * This counter is used both as a hash input and partially encoded into
@@ -619,7 +619,7 @@ static inline void tcp_bound_rto(const struct sock *sk)

static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
	return (tp->srtt >> 3) + tp->rttvar;
	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
@@ -656,6 +656,11 @@ static inline u32 tcp_rto_min(struct sock *sk)
	return rto_min;
}

static inline u32 tcp_rto_min_us(struct sock *sk)
{
	return jiffies_to_usecs(tcp_rto_min(sk));
}

/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer push more data
 * than the offered window.
@@ -778,7 +783,6 @@ enum tcp_ca_event {
#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CONG_NON_RESTRICTED 0x1
#define TCP_CONG_RTT_STAMP	0x2

struct tcp_congestion_ops {
	struct list_head	list;
+5 −2
Original line number Diff line number Diff line
@@ -11,12 +11,15 @@
#define TCP_METRICS_GENL_VERSION	0x1

enum tcp_metric_index {
	TCP_METRIC_RTT,
	TCP_METRIC_RTTVAR,
	TCP_METRIC_RTT,		/* in ms units */
	TCP_METRIC_RTTVAR,	/* in ms units */
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,

	TCP_METRIC_RTT_US,	/* in usec units */
	TCP_METRIC_RTTVAR_US,	/* in usec units */

	/* Always last.  */
	__TCP_METRIC_MAX,
};
+4 −4
Original line number Diff line number Diff line
@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
	INIT_LIST_HEAD(&tp->tsq_node);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
@@ -2339,7 +2339,7 @@ int tcp_disconnect(struct sock *sk, int flags)

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	tp->srtt_us = 0;
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
@@ -2783,8 +2783,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
	info->tcpi_rtt = tp->srtt_us >> 3;
	info->tcpi_rttvar = tp->mdev_us >> 2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_snd_cwnd = tp->snd_cwnd;
	info->tcpi_advmss = tp->advmss;
+0 −4
Original line number Diff line number Diff line
@@ -476,10 +476,6 @@ static int __init cubictcp_register(void)
	/* divide by bic_scale and by constant Srtt (100ms) */
	do_div(cube_factor, bic_scale * 10);

	/* hystart needs ms clock resolution */
	if (hystart && HZ < 1000)
		cubictcp.flags |= TCP_CONG_RTT_STAMP;

	return tcp_register_congestion_control(&cubictcp);
}

Loading