Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a624f93c authored by David S. Miller
Browse files

Merge branch 'tcp-bbr'



Neal Cardwell says:

====================
tcp: BBR congestion control algorithm

This patch series implements a new TCP congestion control algorithm:
BBR (Bottleneck Bandwidth and RTT). A paper with a detailed
description of BBR will be published in ACM Queue, September-October
2016, as "BBR: Congestion-Based Congestion Control". BBR is widely
deployed in production at Google.

The patch series starts with a set of supporting infrastructure
changes, including a few that extend the congestion control
framework. The last patch adds BBR as a TCP congestion control
module. Please see individual patches for the details.

- v3 -> v4:
 - Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control"
   to use const to qualify all the constant parameters.
   Thanks to Stephen Hemminger.
 - In "tcp_bbr: add BBR congestion control", remove the bbr_rate_kbps()
   function, which had a 64-bit divide that would be problematic on some
   architectures, and just use bbr_rate_bytes_per_sec() directly.
   Thanks to Kenneth Klette Jonassen for suggesting this.
 - In "tcp: switch back to proper tcp_skb_cb size check in tcp_init()",
   switched from sizeof(skb->cb) to FIELD_SIZEOF.
   Thanks to Lance Richardson for suggesting this.
 - Updated "tcp_bbr: add BBR congestion control" commit message with
   performance data, more details about deployment at Google, and
   another reminder to use fq with BBR.
 - Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control"
   to use MODULE_LICENSE("Dual BSD/GPL").

- v2 -> v3: fix another issue caught by build bots:
 - adjust rate_sample struct initialization syntax to allow gcc-4.4 to compile
   the "tcp: track data delivery rate for a TCP connection" patch; also
   adjusted some similar syntax in "tcp_bbr: add BBR congestion control"

- v1 -> v2: fix issues caught by build bots:
 - fix "tcp: export data delivery rate" to use rate64 instead of rate,
   so there is a 64-bit numerator for the do_div call
 - fix conflicting definitions for minmax caused by
   "tcp: use windowed min filter library for TCP min_rtt estimation"
   with a new commit:
   tcp: cdg: rename struct minmax in tcp_cdg.c to avoid a naming conflict
 - fix warning about the use of __packed in
   "tcp: track data delivery rate for a TCP connection",
   which involves the addition of a new commit:
   tcp: switch back to proper tcp_skb_cb size check in tcp_init()
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 94d308d0 0f8782ea
Loading
Loading
Loading
Loading
+10 −4
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@


#include <linux/skbuff.h>
#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -212,7 +213,8 @@ struct tcp_sock {
		u8 reord;    /* reordering detected */
	} rack;
	u16	advmss;		/* Advertised MSS			*/
	u8	unused;
	u8	rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
		unused:7;
	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
		thin_lto    : 1,/* Use linear timeouts for thin streams */
		thin_dupack : 1,/* Fast retransmit on first dupack      */
@@ -234,9 +236,7 @@ struct tcp_sock {
	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
	u32	rttvar_us;	/* smoothed mdev_max			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/
	struct rtt_meas {
		u32 rtt, ts;	/* RTT in usec and sampling time in jiffies. */
	} rtt_min[3];
	struct  minmax rtt_min;

	u32	packets_out;	/* Packets which are "in flight"	*/
	u32	retrans_out;	/* Retransmitted packets out		*/
@@ -268,6 +268,12 @@ struct tcp_sock {
				 * receiver in Recovery. */
	u32	prr_out;	/* Total number of pkts sent during Recovery. */
	u32	delivered;	/* Total data packets delivered incl. rexmits */
	u32	lost;		/* Total data packets lost incl. rexmits */
	u32	app_limited;	/* limited until "delivered" reaches this val */
	struct skb_mstamp first_tx_mstamp;  /* start of window send phase */
	struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
	u32	rate_delivered;    /* saved rate sample: packets delivered */
	u32	rate_interval_us;  /* saved rate sample: time elapsed */

 	u32	rcv_wnd;	/* Current receiver window		*/
	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
+37 −0
Original line number Diff line number Diff line
/**
 * lib/minmax.c: windowed min/max tracker by Kathleen Nichols.
 *
 */
#ifndef MINMAX_H
#define MINMAX_H

#include <linux/types.h>

/* A single data point for our parameterized min-max tracker: one measured
 * value paired with the time at which it was taken. Both are plain u32s;
 * this header does not fix their units.
 */
struct minmax_sample {
	u32	t;	/* time measurement was taken */
	u32	v;	/* value measured */
};

/* State for the parameterized min-max tracker: three samples.
 * s[0] is the sample whose value minmax_get() reports, and minmax_reset()
 * sets all three slots to the same sample.
 * NOTE(review): s[1] and s[2] presumably hold the fallback (2nd/3rd best)
 * candidates maintained by the out-of-line update code -- confirm there.
 */
struct minmax {
	struct minmax_sample s[3];
};

static inline u32 minmax_get(const struct minmax *m)
{
	return m->s[0].v;
}

/* Restart the tracker from a single sample: every slot is overwritten with
 * (t, meas), and the value now held in slot 0 is returned.
 */
static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas)
{
	const struct minmax_sample seed = { .t = t, .v = meas };
	int i;

	for (i = 0; i < 3; i++)
		m->s[i] = seed;

	return m->s[0].v;
}

/* Out-of-line tracker updates; only the declarations live in this header.
 * NOTE(review): presumably each call feeds measurement "meas", taken at
 * time "t", into "m", with "win" the window length in the same units as
 * "t", and returns the updated windowed max (resp. min) -- confirm against
 * the implementation.
 */
u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas);
u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas);

#endif
+2 −2
Original line number Diff line number Diff line
@@ -134,8 +134,8 @@ struct inet_connection_sock {
	} icsk_mtup;
	u32			  icsk_user_timeout;

	u64			  icsk_ca_priv[64 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE      (8 * sizeof(u64))
	u64			  icsk_ca_priv[88 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE      (11 * sizeof(u64))
};

#define ICSK_TIME_RETRANS	1	/* Retransmit timer */
+50 −3
Original line number Diff line number Diff line
@@ -533,6 +533,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */

u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
		     int min_tso_segs);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle);
bool tcp_may_send_now(struct sock *sk);
@@ -671,7 +673,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
	return tp->rtt_min[0].rtt;
	return minmax_get(&tp->rtt_min);
}

/* Compute the actual receive window we are currently advertising.
@@ -763,8 +765,16 @@ struct tcp_skb_cb {
	__u32		ack_seq;	/* Sequence number ACK'd	*/
	union {
		struct {
			/* There is space for up to 20 bytes */
			__u32 in_flight;/* Bytes in flight when packet sent */
			/* There is space for up to 24 bytes */
			__u32 in_flight:30,/* Bytes in flight at transmit */
			      is_app_limited:1, /* cwnd not fully used? */
			      unused:1;
			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
			__u32 delivered;
			/* start of send pipeline phase */
			struct skb_mstamp first_tx_mstamp;
			/* when we reached the "delivered" count */
			struct skb_mstamp delivered_mstamp;
		} tx;   /* only used for outgoing skbs */
		union {
			struct inet_skb_parm	h4;
@@ -860,6 +870,27 @@ struct ack_sample {
	u32 in_flight;
};

/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered ("delivered") over an interval of time ("interval_us").
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally choose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	struct	skb_mstamp prior_mstamp; /* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	u32  prior_in_flight;	/* in flight before this ACK */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
};

struct tcp_congestion_ops {
	struct list_head	list;
	u32 key;
@@ -884,6 +915,14 @@ struct tcp_congestion_ops {
	u32  (*undo_cwnd)(struct sock *sk);
	/* hook for packet ack accounting (optional) */
	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
	/* suggest number of segments for each skb to transmit (optional) */
	u32 (*tso_segs_goal)(struct sock *sk);
	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
	u32 (*sndbuf_expand)(struct sock *sk);
	/* call when packets are delivered to update cwnd and pacing rate,
	 * after all the ca_state processing. (optional)
	 */
	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
	/* get info for inet_diag (optional) */
	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
			   union tcp_cc_info *info);
@@ -946,6 +985,14 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
		icsk->icsk_ca_ops->cwnd_event(sk, event);
}

/* From tcp_rate.c */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
			    struct rate_sample *rs);
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
		  struct skb_mstamp *now, struct rate_sample *rs);
void tcp_rate_check_app_limited(struct sock *sk);

/* These functions determine how the current flow behaves in respect of SACK
 * handling. SACK is negotiated with the peer, and therefore it can vary
 * between different flows.
+13 −0
Original line number Diff line number Diff line
@@ -124,6 +124,7 @@ enum {
	INET_DIAG_PEERS,
	INET_DIAG_PAD,
	INET_DIAG_MARK,
	INET_DIAG_BBRINFO,
	__INET_DIAG_MAX,
};

@@ -157,8 +158,20 @@ struct tcp_dctcp_info {
	__u32	dctcp_ab_tot;
};

/* INET_DIAG_BBRINFO */

/* BBR state exported through the "bbr" member of union tcp_cc_info.
 * The 64-bit bandwidth estimate is carried as two __u32 halves
 * (bbr_bw_lo / bbr_bw_hi); the two gains are fixed-point values scaled
 * by 2^8.
 */
struct tcp_bbr_info {
	/* u64 bw: max-filtered BW (app throughput) estimate in bytes per second: */
	__u32	bbr_bw_lo;		/* lower 32 bits of bw */
	__u32	bbr_bw_hi;		/* upper 32 bits of bw */
	__u32	bbr_min_rtt;		/* min-filtered RTT in usec */
	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
};

/* Congestion-control-specific diagnostic info, one member per algorithm
 * that exports state. This is the type of the "info" argument of the
 * get_info hook in struct tcp_congestion_ops.
 */
union tcp_cc_info {
	struct tcpvegas_info	vegas;
	struct tcp_dctcp_info	dctcp;
	struct tcp_bbr_info	bbr;
};
#endif /* _UAPI_INET_DIAG_H_ */
Loading