Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 95bd09eb authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller
Browse files

tcp: TSO packets automatic sizing



After hearing many people over past years complaining against TSO being
bursty or even buggy, we are proud to present automatic sizing of TSO
packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic
relying on upcoming ACKs instead of a timer, but more generally, having
big TSO packets makes little sense for low rates, as it tends to create
micro bursts on the network, and general consensus is to reduce the
buffering amount.

This patch introduces a per socket sk_pacing_rate, that approximates
the current sending rate, and allows us to size the TSO packets so
that we try to send one packet every ms.

This field could be set by other transports.

Patch has no impact for high speed flows, where having large TSO packets
makes sense to reach line rate.

For other flows, this helps better packet scheduling and ACK clocking.

This patch increases performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the
minimal size of a TSO packet (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using
sk_pacing_rate as an input to perform optional per flow pacing.

This explains why we chose to set sk_pacing_rate to twice the current
rate, allowing 'slow start' ramp up.

sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported a suspected deferring of the last two segments on
an initial write of 10 MSS; I had to change tcp_tso_should_defer() to take
into account tp->xmit_size_goal_segs

Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: default avatarYuchung Cheng <ycheng@google.com>
Acked-by: default avatarNeal Cardwell <ncardwell@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent b800c3b9
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
tcp_timestamps - BOOLEAN
	Enable timestamps as defined in RFC1323.

tcp_min_tso_segs - INTEGER
	Minimal number of segments per TSO frame.
	Since linux-3.12, TCP does an automatic sizing of TSO frames,
	depending on flow rate, instead of filling 64Kbytes packets.
	For specific usages, it's possible to force TCP to build big
	TSO frames. Note that the TCP stack might split overly large
	TSO packets if the available window is too small.
	Default: 2

tcp_tso_win_divisor - INTEGER
	This allows control over what percentage of the congestion window
	can be consumed by a single TSO frame.
+2 −0
Original line number Diff line number Diff line
@@ -232,6 +232,7 @@ struct cg_proto;
  *	@sk_napi_id: id of the last napi context to receive data for sk
  *	@sk_ll_usec: usecs to busypoll when there is no data
  *	@sk_allocation: allocation mode
  *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *	@sk_sndbuf: size of send buffer in bytes
  *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
	kmemcheck_bitfield_end(flags);
	int			sk_wmem_queued;
	gfp_t			sk_allocation;
	u32			sk_pacing_rate; /* bytes per second */
	netdev_features_t	sk_route_caps;
	netdev_features_t	sk_route_nocaps;
	int			sk_gso_type;
+1 −0
Original line number Diff line number Diff line
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
+10 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -760,6 +761,15 @@ static struct ctl_table ipv4_table[] = {
		.extra1		= &zero,
		.extra2		= &four,
	},
	{
		.procname	= "tcp_min_tso_segs",
		.data		= &sysctl_tcp_min_tso_segs,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &gso_max_segs,
	},
	{
		.procname	= "udp_mem",
		.data		= &sysctl_udp_mem,
+23 −5
Original line number Diff line number Diff line
@@ -283,6 +283,8 @@

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

int sysctl_tcp_min_tso_segs __read_mostly = 2;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
	xmit_size_goal = mss_now;

	if (large_allowed && sk_can_gso(sk)) {
		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
				  inet_csk(sk)->icsk_af_ops->net_header_len -
				  inet_csk(sk)->icsk_ext_hdr_len -
				  tp->tcp_header_len);
		u32 gso_size, hlen;

		/* Maybe we should/could use sk->sk_prot->max_header here ? */
		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
		       inet_csk(sk)->icsk_ext_hdr_len +
		       tp->tcp_header_len;

		/* Goal is to send at least one packet per ms,
		 * not one big TSO packet every 100 ms.
		 * This preserves ACK clocking and is consistent
		 * with tcp_tso_should_defer() heuristic.
		 */
		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
		gso_size = max_t(u32, gso_size,
				 sysctl_tcp_min_tso_segs * mss_now);

		xmit_size_goal = min_t(u32, gso_size,
				       sk->sk_gso_max_size - 1 - hlen);

		/* TSQ : try to have two TSO segments in flight */
		/* TSQ : try to have at least two segments in flight
		 * (one in NIC TX ring, another in Qdisc)
		 */
		xmit_size_goal = min_t(u32, xmit_size_goal,
				       sysctl_tcp_limit_output_bytes >> 1);

Loading