
Commit e51271d4 authored by David S. Miller

Merge branch 'tcp_dccp_ports'



Eric Dumazet says:

====================
tcp/dccp: better use of ephemeral ports

Big servers have a bloated bind table, which makes it very hard for
ephemeral port allocations to succeed without special containers/namespace tricks.

This patch series extends the strategy added in commit 07f4c900
("tcp/dccp: try to not exhaust ip_local_port_range in connect()").

Since ports used by connect() are likely to be shared among many flows,
we give a hint to both bind() and connect() to keep the two crowds
separated, if possible.

Of course, if an application on a given host needs to allocate ~30000
ports using bind(), it will still be able to do so. The same holds for
~30000 connect() calls to a single 2-tuple (dst addr, dst port).

The new implementation is also friendlier to softirqs and reschedules.

v2: rebase after TCP SO_REUSEPORT changes
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3134b9f0 ea8add2b
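
The core mechanism of the series is easy to see in isolation: bind(0) now starts its search at an odd offset and steps by two, while connect() starts at an even offset, so the two allocators mostly probe disjoint halves of the port space and only spill into each other's half once their own parity is exhausted. The following stand-alone sketch is illustrative only, not kernel code; pick_port() and is_port_taken() are hypothetical helpers invented for this example:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define PORT_LOW  32768
#define PORT_HIGH 61000		/* exclusive: [32768, 61000[ */

/* Stub standing in for the kernel's bind-hash lookup. */
static bool is_port_taken(int port)
{
	(void)port;
	return false;
}

/* Scan every second port of one parity, wrapping once around the range.
 * want_odd == true mimics inet_csk_get_port() (bind),
 * want_odd == false mimics __inet_hash_connect() (connect).
 */
static int pick_port(bool want_odd)
{
	unsigned int remaining = (PORT_HIGH - PORT_LOW) & ~1U;
	unsigned int offset = (unsigned int)rand() % remaining;
	unsigned int i;
	int port;

	if (want_odd)
		offset |= 1U;	/* odd start, as bind(0) now does */
	else
		offset &= ~1U;	/* even start, as connect() does */

	port = PORT_LOW + (int)offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (port >= PORT_HIGH)
			port -= (int)remaining;
		if (!is_port_taken(port))
			return port;
	}
	return -1;	/* caller would then retry with the other parity */
}

int main(void)
{
	printf("bind-style pick:    %d\n", pick_port(true));
	printf("connect-style pick: %d\n", pick_port(false));
	return 0;
}

Both kernel functions implement exactly this fallback by flipping the low bit of the offset and rescanning, so the parity split is a first-choice heuristic, never a hard limit.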
net/ipv4/inet_connection_sock.c (+114 −126)
@@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
  */
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, attempts = 5, port = snum;
+	int smallest_size = -1, smallest_port;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
-	int smallest_size = -1, smallest_rover;
+	int i, low, high, attempt_half;
+	struct inet_bind_bucket *tb;
 	kuid_t uid = sock_i_uid(sk);
-	int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+	u32 remaining, offset;
 
-	local_bh_disable();
-	if (!snum) {
-		int remaining, rover, low, high;
+	if (port) {
+have_port:
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port)
+				goto tb_found;
 
+		goto tb_not_found;
+	}
 again:
-		inet_get_local_port_range(net, &low, &high);
-		if (attempt_half) {
-			int half = low + ((high - low) >> 1);
-
-			if (attempt_half == 1)
-				high = half;
-			else
-				low = half;
-		}
-		remaining = (high - low) + 1;
-		smallest_rover = rover = prandom_u32() % remaining + low;
-
-		smallest_size = -1;
-		do {
-			if (inet_is_local_reserved_port(net, rover))
-				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (((tb->fastreuse > 0 &&
-					      sk->sk_reuse &&
-					      sk->sk_state != TCP_LISTEN) ||
-					     (tb->fastreuseport > 0 &&
-					      sk->sk_reuseport &&
-					      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-					      uid_eq(tb->fastuid, uid))) &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
-				}
-			break;
-		next:
-			spin_unlock(&head->lock);
-		next_nolock:
-			if (++rover > high)
-				rover = low;
-		} while (--remaining > 0);
-
-		/* Exhausted local port range during search?  It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (remaining <= 0) {
-			if (smallest_size != -1) {
-				snum = smallest_rover;
-				goto have_snum;
-			}
-			if (attempt_half == 1) {
-				/* OK we now try the upper half of the range */
-				attempt_half = 2;
-				goto again;
-			}
-			goto fail;
-		}
-		/* OK, here is the one we will use.  HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
-				goto tb_found;
-	}
-	tb = NULL;
-	goto tb_not_found;
+	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	if (high - low < 4)
+		attempt_half = 0;
+	if (attempt_half) {
+		int half = low + (((high - low) >> 2) << 1);
+
+		if (attempt_half == 1)
+			high = half;
+		else
+			low = half;
+	}
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = prandom_u32() % remaining;
+	/* __inet_hash_connect() favors ports having @low parity
+	 * We do the opposite to not pollute connect() users.
+	 */
+	offset |= 1U;
+	smallest_size = -1;
+	smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
+				if (((tb->fastreuse > 0 && reuse) ||
+				     (tb->fastreuseport > 0 &&
+				      sk->sk_reuseport &&
+				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      uid_eq(tb->fastuid, uid))) &&
+				    (tb->num_owners < smallest_size || smallest_size == -1)) {
+					smallest_size = tb->num_owners;
+					smallest_port = port;
+				}
+				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+					goto tb_found;
+				goto next_port;
+			}
+		goto tb_not_found;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
+	}
+
+	if (smallest_size != -1) {
+		port = smallest_port;
+		goto have_port;
+	}
+	offset--;
+	if (!(offset & 1))
+		goto other_parity_scan;
+
+	if (attempt_half == 1) {
+		/* OK we now try the upper half of the range */
+		attempt_half = 2;
+		goto other_half_scan;
+	}
+	return ret;
+
+tb_not_found:
+	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+				     net, head, port);
+	if (!tb)
+		goto fail_unlock;
 tb_found:
 	if (!hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 &&
-		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+		if (((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
-		      sk->sk_reuseport &&
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+		    smallest_size == -1)
 			goto success;
-		} else {
-			ret = 1;
-			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    smallest_size != -1 && --attempts >= 0) {
-					spin_unlock(&head->lock);
-					goto again;
-				}
-
-				goto fail_unlock;
-			}
-		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-					net, head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
-			tb->fastreuse = 0;
-		if (sk->sk_reuseport) {
-			tb->fastreuseport = 1;
-			tb->fastuid = uid;
-		} else
-			tb->fastreuseport = 0;
-	} else {
-		if (tb->fastreuse &&
-		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-			tb->fastreuse = 0;
-		if (tb->fastreuseport &&
-		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
-			tb->fastreuseport = 0;
-	}
+		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+			if ((reuse ||
+			     (tb->fastreuseport > 0 &&
+			      sk->sk_reuseport &&
+			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			      uid_eq(tb->fastuid, uid))) &&
+			    smallest_size != -1 && --attempts >= 0) {
+				spin_unlock_bh(&head->lock);
+				goto again;
+			}
+			goto fail_unlock;
+		}
+		if (!reuse)
+			tb->fastreuse = 0;
+		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+			tb->fastreuseport = 0;
+	} else {
+		tb->fastreuse = reuse;
+		if (sk->sk_reuseport) {
+			tb->fastreuseport = 1;
+			tb->fastuid = uid;
+		} else {
+			tb->fastreuseport = 0;
+		}
+	}
 success:
 	if (!inet_csk(sk)->icsk_bind_hash)
-		inet_bind_hash(sk, tb, snum);
+		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
 	ret = 0;
 
 fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
+	spin_unlock_bh(&head->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
net/ipv4/inet_hashtables.c (+85 −85)
@@ -565,43 +565,59 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			struct sock *, __u16, struct inet_timewait_sock **))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
-	const unsigned short snum = inet_sk(sk)->inet_num;
+	struct inet_timewait_sock *tw = NULL;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret;
+	int port = inet_sk(sk)->inet_num;
 	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb;
+	u32 remaining, offset;
+	int ret, i, low, high;
+	static u32 hint;
+
+	if (port) {
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		tb = inet_csk(sk)->icsk_bind_hash;
+		spin_lock_bh(&head->lock);
+		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+			inet_ehash_nolisten(sk, NULL);
+			spin_unlock_bh(&head->lock);
+			return 0;
+		}
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = check_established(death_row, sk, port, NULL);
+		local_bh_enable();
+		return ret;
+	}
 
-	if (!snum) {
-		int i, remaining, low, high, port;
-		static u32 hint;
-		u32 offset = hint + port_offset;
-		struct inet_timewait_sock *tw = NULL;
-
-		inet_get_local_port_range(net, &low, &high);
-		remaining = (high - low) + 1;
-
-		/* By starting with offset being an even number,
-		 * we tend to leave about 50% of ports for other uses,
-		 * like bind(0).
-		 */
-		offset &= ~1;
-
-		local_bh_disable();
-		for (i = 0; i < remaining; i++) {
-			port = low + (i + offset) % remaining;
-			if (inet_is_local_reserved_port(net, port))
-				continue;
-			head = &hinfo->bhash[inet_bhashfn(net, port,
-					hinfo->bhash_size)];
-			spin_lock(&head->lock);
-
-			/* Does not bother with rcv_saddr checks,
-			 * because the established check is already
-			 * unique enough.
-			 */
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = (hint + port_offset) % remaining;
+	/* In first pass we try ports of @low parity.
+	 * inet_csk_get_port() does the opposite choice.
+	 */
+	offset &= ~1U;
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+
+		/* Does not bother with rcv_saddr checks, because
+		 * the established check is already unique enough.
+		 */
 		inet_bind_bucket_for_each(tb, &head->chain) {
-				if (net_eq(ib_net(tb), net) &&
-				    tb->port == port) {
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
 				if (tb->fastreuse >= 0 ||
 				    tb->fastreuseport >= 0)
 					goto next_port;
@@ -616,22 +632,25 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 					     net, head, port);
 		if (!tb) {
-			spin_unlock(&head->lock);
-			break;
+			spin_unlock_bh(&head->lock);
+			return -ENOMEM;
 		}
 		tb->fastreuse = -1;
 		tb->fastreuseport = -1;
 		goto ok;
-
 next_port:
-			spin_unlock(&head->lock);
+		spin_unlock_bh(&head->lock);
+		cond_resched();
 	}
-		local_bh_enable();
 
+	offset++;
+	if ((offset & 1) && remaining > 1)
+		goto other_parity_scan;
+
 	return -EADDRNOTAVAIL;
 
 ok:
-		hint += (i + 2) & ~1;
+	hint += i + 2;
 
 	/* Head lock still held and bh's disabled */
 	inet_bind_hash(sk, tb, port);
@@ -642,29 +661,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	if (tw)
 		inet_twsk_bind_unhash(tw, hinfo);
 	spin_unlock(&head->lock);
-
 	if (tw)
 		inet_twsk_deschedule_put(tw);
-
-		ret = 0;
-		goto out;
-	}
-
-	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
-	tb  = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		inet_ehash_nolisten(sk, NULL);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = check_established(death_row, sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
+	local_bh_enable();
+	return 0;
 }
 
 /*
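
One detail in __inet_hash_connect() deserves a remark: on success the code does hint += i + 2, where i is the loop index that advanced in steps of two, so the next connect() starts its even-parity scan just past the port this call claimed instead of re-walking the same crowded buckets. A toy model of that arithmetic, with all names local to the sketch rather than kernel symbols:

#include <stdio.h>

#define LOW  32768
#define HIGH 61000	/* exclusive, matching "[32768, 61000[" above */

static unsigned int hint;	/* models the static hint in __inet_hash_connect() */

/* Port probed at index k of the even-parity scan for one destination;
 * port_offset stands in for the per-destination hash the kernel uses.
 */
static unsigned int probe(unsigned int port_offset, unsigned int k)
{
	unsigned int remaining = (HIGH - LOW) & ~1U;
	unsigned int offset = (hint + port_offset) % remaining;

	offset &= ~1U;			/* first pass uses even parity */
	return LOW + (offset + 2 * k) % remaining;
}

int main(void)
{
	unsigned int dst = 12345;	/* hypothetical 2-tuple hash */
	unsigned int i = 4;		/* suppose the probe at i == 4 succeeded */

	printf("claimed port %u\n", probe(dst, i / 2));
	hint += i + 2;			/* mirrors "hint += i + 2;" above */
	printf("next search starts at %u\n", probe(dst, 0));
	return 0;
}

The per-bucket spin_lock_bh()/cond_resched() pairing visible in both scan loops is what the cover letter means by being friendlier to softirqs and reschedules: bottom halves are blocked for one bucket at a time only, and the scan may yield between buckets.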