Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e26925ec authored by David S. Miller
Browse files

Merge branch 'tcp-TCP-TS-option-use-1-ms-clock'

Eric Dumazet says:

====================
tcp: TCP TS option use 1 ms clock

TCP Timestamps option is defined in RFC 7323

Traditionally on Linux, it has been tied to the internal
'jiffies' variable, because that was a cheap and good-enough
clock source.

Unfortunately some distros use HZ=250 or even HZ=100 leading
to not very useful TCP timestamps.

For TCP flows in the DC, Google has used usec resolution for more
than two years with great success [1].
RCVBUF autotuning is more precise.

This series converts tp->tcp_mstamp to a plain u64 value storing
a 1 usec TCP clock.

This choice will allow us to upstream the 1 usec TS option as
discussed in IETF 97.

Kathleen Nichols [2] and others advocate for 1ms TS clocks for
network analysis. (1ms being the lowest value supported by RFC 7323.)

[1] https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
[2] http://netseminar.stanford.edu/seminars/02_02_17.pdf


====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 9d4f97f9 9a568de4
Loading
Loading
Loading
Loading
+1 −61
Original line number Diff line number Diff line
@@ -506,66 +506,6 @@ typedef unsigned int sk_buff_data_t;
typedef unsigned char *sk_buff_data_t;
#endif

/**
 * struct skb_mstamp - multi resolution time stamps
 * @v64: the two 32-bit stamps viewed as a single 64-bit value (shares
 *       storage with @stamp_us and @stamp_jiffies through the union)
 * @stamp_us: timestamp in us resolution
 * @stamp_jiffies: timestamp in jiffies
 */
struct skb_mstamp {
	union {
		u64		v64;
		struct {
			u32	stamp_us;
			u32	stamp_jiffies;
		};
	};
};

/**
 * skb_mstamp_get - get current timestamp
 * @cl: place to store timestamps
 */
static inline void skb_mstamp_get(struct skb_mstamp *cl)
{
	u64 val = local_clock();

	do_div(val, NSEC_PER_USEC);
	cl->stamp_us = (u32)val;
	cl->stamp_jiffies = (u32)jiffies;
}

/**
 * skb_mstamp_us_delta - compute the difference in usec between two skb_mstamp
 * @t1: pointer to newest sample
 * @t0: pointer to oldest sample
 *
 * Return: the usec difference of the two stamps, or the jiffies difference
 * converted with jiffies_to_usecs() when the usec difference is not usable.
 */
static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
				      const struct skb_mstamp *t0)
{
	s32 elapsed_us = t1->stamp_us - t0->stamp_us;
	u32 elapsed_jiffies = t1->stamp_jiffies - t0->stamp_jiffies;

	/* A non-positive usec difference may mean the interval is too big
	 * or that local_clock() drifted too much; a jiffies difference at
	 * or above the limit below is handled the same way. In both cases
	 * derive the result from jiffies instead.
	 */
	if (elapsed_us <= 0 ||
	    elapsed_jiffies >= (INT_MAX / (USEC_PER_SEC / HZ))) {
		elapsed_us = jiffies_to_usecs(elapsed_jiffies);
	}

	return elapsed_us;
}

/* Return true when @t1 is later than @t0.
 * The jiffies stamps decide; the usec stamps are only consulted when
 * both jiffies stamps are equal. Differences are taken as signed 32-bit
 * values.
 */
static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
				    const struct skb_mstamp *t0)
{
	s32 jiffies_gap = t1->stamp_jiffies - t0->stamp_jiffies;
	s32 us_gap;

	if (jiffies_gap)
		return jiffies_gap > 0;

	us_gap = t1->stamp_us - t0->stamp_us;
	return us_gap > 0;
}

/** 
 *	struct sk_buff - socket buffer
 *	@next: Next buffer in list
@@ -646,7 +586,7 @@ struct sk_buff {

			union {
				ktime_t		tstamp;
				struct skb_mstamp skb_mstamp;
				u64		skb_mstamp;
			};
		};
		struct rb_node	rbnode; /* used in netem & tcp stack */
+11 −11
Original line number Diff line number Diff line
@@ -123,7 +123,7 @@ struct tcp_request_sock_ops;
struct tcp_request_sock {
	struct inet_request_sock 	req;
	const struct tcp_request_sock_ops *af_specific;
	struct skb_mstamp		snt_synack; /* first SYNACK sent time */
	u64				snt_synack; /* first SYNACK sent time */
	bool				tfo_listener;
	u32				txhash;
	u32				rcv_isn;
@@ -211,7 +211,7 @@ struct tcp_sock {

	/* Information of the most recently (s)acked skb */
	struct tcp_rack {
		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
		u64 mstamp; /* (Re)sent time of the skb */
		u32 rtt_us;  /* Associated RTT */
		u32 end_seq; /* Ending TCP sequence of the skb */
		u8 advanced; /* mstamp advanced since last lost marking */
@@ -240,7 +240,7 @@ struct tcp_sock {
	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
	struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */
	u64	tcp_mstamp;	/* most recent packet received/sent */
	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
	u32	mdev_us;	/* medium deviation			*/
	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
@@ -280,8 +280,8 @@ struct tcp_sock {
	u32	delivered;	/* Total data packets delivered incl. rexmits */
	u32	lost;		/* Total data packets lost incl. rexmits */
	u32	app_limited;	/* limited until "delivered" reaches this val */
	struct skb_mstamp first_tx_mstamp;  /* start of window send phase */
	struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
	u64	first_tx_mstamp;  /* start of window send phase */
	u64	delivered_mstamp; /* time we reached "delivered" */
	u32	rate_delivered;    /* saved rate sample: packets delivered */
	u32	rate_interval_us;  /* saved rate sample: time elapsed */

@@ -337,14 +337,14 @@ struct tcp_sock {
	struct {
		u32	rtt_us;
		u32	seq;
		struct skb_mstamp time;
		u64	time;
	} rcv_rtt_est;

/* Receiver queue space */
	struct {
		int	space;
		u32	seq;
		struct skb_mstamp time;
		u64	time;
	} rcvq_space;

/* TCP-specific MTU probe information. */
+59 −15
Original line number Diff line number Diff line
@@ -519,7 +519,7 @@ static inline u32 tcp_cookie_time(void)
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
			      u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
__u32 cookie_init_timestamp(struct request_sock *req);
u64 cookie_init_timestamp(struct request_sock *req);
bool cookie_timestamp_decode(struct tcp_options_received *opt);
bool cookie_ecn_ok(const struct tcp_options_received *opt,
		   const struct net *net, const struct dst_entry *dst);
@@ -700,17 +700,61 @@ u32 __tcp_select_window(struct sock *sk);

void tcp_send_window_probe(struct sock *sk);

/* TCP timestamps are only 32-bits, this causes a slight
 * complication on 64-bit systems since we store a snapshot
 * of jiffies in the buffer control blocks below.  We decided
 * to use only the low 32-bits of jiffies and hide the ugly
 * casts with the following macro.
/* TCP uses 32bit jiffies to save some space.
 * Note that this is different from tcp_time_stamp, which
 * historically has been the same until linux-4.13.
 */
#define tcp_time_stamp		((__u32)(jiffies))
#define tcp_jiffies32 ((u32)jiffies)

/*
 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
 * It is no longer tied to jiffies, but to 1 ms clock.
 * Note: double check if you want to use tcp_jiffies32 instead of this.
 */
#define TCP_TS_HZ	1000

/* Base clock for the TCP time helpers: returns local_clock() unchanged. */
static inline u64 tcp_clock_ns(void)
{
	u64 now_ns = local_clock();

	return now_ns;
}

/* tcp_clock_ns() scaled down by NSEC_PER_USEC. */
static inline u64 tcp_clock_us(void)
{
	u64 now_ns = tcp_clock_ns();

	return div_u64(now_ns, NSEC_PER_USEC);
}

/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
{
	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}

/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
{
	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
}


/* Refresh the 1us clock of a TCP socket.
 * tp->tcp_mstamp is only ever moved forward: a reading that is not
 * strictly newer than the stored value is ignored.
 */
static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	u64 now_us = tcp_clock_us();

	if (now_us <= tp->tcp_mstamp)
		return;

	tp->tcp_mstamp = now_us;
}

/* Difference @t1 - @t0 between two u64 stamps.
 * The difference is taken as a signed 64-bit value and clamped to 0
 * when negative; the result is then narrowed to the u32 return type.
 */
static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
	s64 elapsed = t1 - t0;

	return max_t(s64, elapsed, 0);
}

/* NOTE(review): the two return statements below look like the removed and
 * the added line of one diff hunk shown without +/- markers; they are
 * presumably not meant to coexist. The first reads a stamp_jiffies member
 * of skb->skb_mstamp, the second divides skb->skb_mstamp by
 * USEC_PER_SEC / TCP_TS_HZ. Confirm against the real header before relying
 * on either form.
 */
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
	return skb->skb_mstamp.stamp_jiffies;
	return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}


@@ -775,9 +819,9 @@ struct tcp_skb_cb {
			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
			__u32 delivered;
			/* start of send pipeline phase */
			struct skb_mstamp first_tx_mstamp;
			u64 first_tx_mstamp;
			/* when we reached the "delivered" count */
			struct skb_mstamp delivered_mstamp;
			u64 delivered_mstamp;
		} tx;   /* only used for outgoing skbs */
		union {
			struct inet_skb_parm	h4;
@@ -893,7 +937,7 @@ struct ack_sample {
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	struct	skb_mstamp prior_mstamp; /* starting timestamp for interval */
	u64  prior_mstamp; /* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
@@ -1242,7 +1286,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
	if (!sysctl_tcp_slow_start_after_idle || tp->packets_out ||
	    ca_ops->cong_control)
		return;
	delta = tcp_time_stamp - tp->lsndtime;
	delta = tcp_jiffies32 - tp->lsndtime;
	if (delta > inet_csk(sk)->icsk_rto)
		tcp_cwnd_restart(sk, delta);
}
@@ -1304,8 +1348,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
{
	const struct inet_connection_sock *icsk = &tp->inet_conn;

	return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime,
			  tcp_time_stamp - tp->rcv_tstamp);
	return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime,
			  tcp_jiffies32 - tp->rcv_tstamp);
}

static inline int tcp_fin_time(const struct sock *sk)
@@ -1859,7 +1903,7 @@ void tcp_init(void);
/* tcp_recovery.c */
extern void tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
			     const struct skb_mstamp *xmit_time);
			     u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);

/*
+4 −4
Original line number Diff line number Diff line
@@ -233,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
	const u32 now = ccid2_time_stamp;
	const u32 now = ccid2_jiffies32;
	struct ccid2_seq *next;

	/* slow-start after idle periods (RFC 2581, RFC 2861) */
@@ -466,7 +466,7 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
	 * The cleanest solution is to not use the ccid2s_sent field at all
	 * and instead use DCCP timestamps: requires changes in other places.
	 */
	ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
	ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent);
}

static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -478,7 +478,7 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
		return;
	}

	hc->tx_last_cong = ccid2_time_stamp;
	hc->tx_last_cong = ccid2_jiffies32;

	hc->tx_cwnd      = hc->tx_cwnd / 2 ? : 1U;
	hc->tx_ssthresh  = max(hc->tx_cwnd, 2U);
@@ -731,7 +731,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)

	hc->tx_rto	 = DCCP_TIMEOUT_INIT;
	hc->tx_rpdupack  = -1;
	hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
	hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32;
	hc->tx_cwnd_used = 0;
	setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
			(unsigned long)sk);
+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@
 * CCID-2 timestamping faces the same issues as TCP timestamping.
 * Hence we reuse/share as much of the code as possible.
 */
#define ccid2_time_stamp	tcp_time_stamp
#define ccid2_jiffies32	((u32)jiffies)

/* NUMDUPACK parameter from RFC 4341, p. 6 */
#define NUMDUPACK	3
Loading