Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ec0a1966 authored by David S. Miller's avatar David S. Miller
Browse files

tcp: Revert 'process defer accept as established' changes.



This reverts two changesets, ec3c0982
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0a
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").

This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.

Ilpo Järvinen first spotted the locking problems.  The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.

Fixing that is non-trivial at best, because we can't simply just grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock.  The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.

Next is a problem noticed by Vitaliy Gusev, he noted:

----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> 		goto death;
> 	}
>
>+	if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+		tcp_send_active_reset(sk, GFP_ATOMIC);
>+		goto death;

Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------

Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:

----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------

So revert this thing for now.

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent f23d60de
Loading
Loading
Loading
Loading
+0 −7
Original line number Diff line number Diff line
@@ -239,11 +239,6 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
	return (struct tcp_request_sock *)req;
}

struct tcp_deferred_accept_info {
	struct sock *listen_sk;
	struct request_sock *request;
};

struct tcp_sock {
	/* inet_connection_sock has to be the first member of tcp_sock */
	struct inet_connection_sock	inet_conn;
@@ -379,8 +374,6 @@ struct tcp_sock {
	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
	int			linger2;

	struct tcp_deferred_accept_info defer_tcp_accept;

	unsigned long last_synq_overflow; 

	u32	tso_deferred;
+2 −2
Original line number Diff line number Diff line
@@ -115,8 +115,8 @@ struct request_sock_queue {
	struct request_sock	*rskq_accept_head;
	struct request_sock	*rskq_accept_tail;
	rwlock_t		syn_wait_lock;
	u16			rskq_defer_accept;
	/* 2 bytes hole, try to pack */
	u8			rskq_defer_accept;
	/* 3 bytes hole, try to pack */
	struct listen_sock	*listen_opt;
};

+0 −1
Original line number Diff line number Diff line
@@ -139,7 +139,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_KEEPINTVL	32767
#define MAX_TCP_KEEPCNT		127
#define MAX_TCP_SYNCNT		127
#define MAX_TCP_ACCEPT_DEFERRED 65535

#define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */

+8 −3
Original line number Diff line number Diff line
@@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
	struct inet_connection_sock *icsk = inet_csk(parent);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct listen_sock *lopt = queue->listen_opt;
	int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;
@@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
		}
	}

	if (queue->rskq_defer_accept)
		max_retries = queue->rskq_defer_accept;

	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	i = lopt->clock_hand;

@@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
		reqp=&lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if (time_after_eq(now, req->expires)) {
				if (req->retrans < thresh &&
				    !req->rsk_ops->rtx_syn_ack(parent, req)) {
				if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) &&
				    (inet_rsk(req)->acked ||
				     !req->rsk_ops->rtx_syn_ack(parent, req))) {
					unsigned long timeo;

					if (req->retrans++ == 0)
+11 −7
Original line number Diff line number Diff line
@@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
		break;

	case TCP_DEFER_ACCEPT:
		if (val < 0) {
			err = -EINVAL;
		} else {
			if (val > MAX_TCP_ACCEPT_DEFERRED)
				val = MAX_TCP_ACCEPT_DEFERRED;
			icsk->icsk_accept_queue.rskq_defer_accept = val;
		icsk->icsk_accept_queue.rskq_defer_accept = 0;
		if (val > 0) {
			/* Translate value in seconds to number of
			 * retransmits */
			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
			       val > ((TCP_TIMEOUT_INIT / HZ) <<
				       icsk->icsk_accept_queue.rskq_defer_accept))
				icsk->icsk_accept_queue.rskq_defer_accept++;
			icsk->icsk_accept_queue.rskq_defer_accept++;
		}
		break;

@@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = icsk->icsk_accept_queue.rskq_defer_accept;
		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
Loading