Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 834e0ecf authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'tcp-remove-prequeue-and-header-prediction'



Florian Westphal says:

====================
tcp: remove prequeue and header prediction

During a hallway discussion with Eric Dumazet at Netdev 1.2 in
Tokyo some maybe-not-so-useful-anymore TCP stack features came up,
among these header prediction and prequeueing.

In brief, TCP prequeue assumes a single-process-blocking-read design,
which is not that common anymore. The most frequently used high-performance
networking program that is an excellent fit for these features is netperf.

The idea behind prequeueing is to move part of tcp processing, including
retransmit queue cleaning, to process context.

With (e)poll designs, prequeue is always skipped, so for such programs
this is dead-code removal.

Header prediction is also less useful nowadays.
For packet trains, GRO will do packet aggregation so we do not get the
per-packet benefit that this had before GRO anymore.

Because of SACK, header prediction also will be ineffective once
a connection suffers even light packet losses.

code removal aside, after this change processing always occurs in BH
context, this allows to experiment e.g. with doing bulk freeing of
skb heads when incoming ACKs clean packets from the retransmit queue.

There are no changes since the RFC, except in last patch (i missed
another no-longer-used mib counter). I also edited a few commit messages.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 764646b0 3282e655
Loading
Loading
Loading
Loading
+1 −6
Original line number Diff line number Diff line
@@ -353,12 +353,7 @@ tcp_l3mdev_accept - BOOLEAN
	compiled with CONFIG_NET_L3_MASTER_DEV.

tcp_low_latency - BOOLEAN
	If set, the TCP stack makes decisions that prefer lower
	latency as opposed to higher throughput.  By default, this
	option is not set meaning that higher throughput is preferred.
	An example of an application where this default should be
	changed would be a Beowulf compute cluster.
	Default: 0
	This is a legacy option, it has no effect anymore.

tcp_max_orphans - INTEGER
	Maximal number of TCP sockets not attached to any user file handle,
+0 −15
Original line number Diff line number Diff line
@@ -147,12 +147,6 @@ struct tcp_sock {
	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
	u16	gso_segs;	/* Max number of segs per GSO packet	*/

/*
 *	Header prediction flags
 *	0x5?10 << 16 + snd_wnd in net byte order
 */
	__be32	pred_flags;

/*
 *	RFC793 variables by their proper names. This means you can
 *	read the code and the spec side by side (and laugh ...)
@@ -192,15 +186,6 @@ struct tcp_sock {

	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */

	/* Data for direct copy to user */
	struct {
		struct sk_buff_head	prequeue;
		struct task_struct	*task;
		struct msghdr		*msg;
		int			memory;
		int			len;
	} ucopy;

	u32	snd_wl1;	/* Sequence for window update		*/
	u32	snd_wnd;	/* The window we expect to receive	*/
	u32	max_window;	/* Maximal window ever seen from peer	*/
+2 −38
Original line number Diff line number Diff line
@@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
@@ -632,29 +631,6 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
			       ntohl(TCP_FLAG_ACK) |
			       snd_wnd);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

static inline void tcp_fast_path_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
	    tp->rcv_wnd &&
	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
	    !tp->urg_data)
		tcp_fast_path_on(tp);
}

/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
@@ -904,9 +880,8 @@ enum tcp_ca_event {

/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
enum tcp_ca_ack_event_flags {
	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
	CA_ACK_WIN_UPDATE	= (1 << 0),	/* ACK updated window */
	CA_ACK_ECE		= (1 << 1),	/* ECE bit is set on ack */
};

/*
@@ -1244,17 +1219,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
		__tcp_checksum_complete(skb);
}

/* Prequeue for VJ style copy to user, combined with checksumming. */

static inline void tcp_prequeue_init(struct tcp_sock *tp)
{
	tp->ucopy.task = NULL;
	tp->ucopy.len = 0;
	tp->ucopy.memory = 0;
	skb_queue_head_init(&tp->ucopy.prequeue);
}

bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
int tcp_filter(struct sock *sk, struct sk_buff *skb);

+0 −9
Original line number Diff line number Diff line
@@ -184,14 +184,7 @@ enum
	LINUX_MIB_DELAYEDACKLOST,		/* DelayedACKLost */
	LINUX_MIB_LISTENOVERFLOWS,		/* ListenOverflows */
	LINUX_MIB_LISTENDROPS,			/* ListenDrops */
	LINUX_MIB_TCPPREQUEUED,			/* TCPPrequeued */
	LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG,	/* TCPDirectCopyFromBacklog */
	LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE,	/* TCPDirectCopyFromPrequeue */
	LINUX_MIB_TCPPREQUEUEDROPPED,		/* TCPPrequeueDropped */
	LINUX_MIB_TCPHPHITS,			/* TCPHPHits */
	LINUX_MIB_TCPHPHITSTOUSER,		/* TCPHPHitsToUser */
	LINUX_MIB_TCPPUREACKS,			/* TCPPureAcks */
	LINUX_MIB_TCPHPACKS,			/* TCPHPAcks */
	LINUX_MIB_TCPRENORECOVERY,		/* TCPRenoRecovery */
	LINUX_MIB_TCPSACKRECOVERY,		/* TCPSackRecovery */
	LINUX_MIB_TCPSACKRENEGING,		/* TCPSACKReneging */
@@ -208,14 +201,12 @@ enum
	LINUX_MIB_TCPSACKFAILURES,		/* TCPSackFailures */
	LINUX_MIB_TCPLOSSFAILURES,		/* TCPLossFailures */
	LINUX_MIB_TCPFASTRETRANS,		/* TCPFastRetrans */
	LINUX_MIB_TCPFORWARDRETRANS,		/* TCPForwardRetrans */
	LINUX_MIB_TCPSLOWSTARTRETRANS,		/* TCPSlowStartRetrans */
	LINUX_MIB_TCPTIMEOUTS,			/* TCPTimeouts */
	LINUX_MIB_TCPLOSSPROBES,		/* TCPLossProbes */
	LINUX_MIB_TCPLOSSPROBERECOVERY,		/* TCPLossProbeRecovery */
	LINUX_MIB_TCPRENORECOVERYFAIL,		/* TCPRenoRecoveryFail */
	LINUX_MIB_TCPSACKRECOVERYFAIL,		/* TCPSackRecoveryFail */
	LINUX_MIB_TCPSCHEDULERFAILED,		/* TCPSchedulerFailed */
	LINUX_MIB_TCPRCVCOLLAPSED,		/* TCPRcvCollapsed */
	LINUX_MIB_TCPDSACKOLDSENT,		/* TCPDSACKOldSent */
	LINUX_MIB_TCPDSACKOFOSENT,		/* TCPDSACKOfoSent */
+0 −9
Original line number Diff line number Diff line
@@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = {
	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
	SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
	SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
	SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
	SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
	SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
@@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = {
	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
	SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
	SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
	SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
Loading