Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 61b7c691 authored by Martin KaFai Lau's avatar Martin KaFai Lau Committed by David S. Miller
Browse files

inet: Add a 2nd listener hashtable (port+addr)



The current listener hashtable is hashed by port only.
When a process is listening at many IP addresses with the same port (e.g.
[IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
performance degrades to that of a linked-list walk.  This makes it prone
to SYN attacks.

UDP had a similar issue and a second hashtable was added to resolve it.

This patch adds a second hashtable for the listener's sockets.
The second hashtable is hashed by port and address.

It cannot reuse the existing skc_portaddr_node which is shared
with skc_bind_node.  TCP listener needs to use skc_bind_node.
Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to
the inet_connection_sock which the listener (like TCP) also belongs to.

The new portaddr hashtable may need two lookups (first by IP:PORT, then
by INADDR_ANY:PORT if IP:PORT is not found).  Hence,
it implements a cut-off similar to UDP's, such that it will only consult the
new portaddr hashtable if the current port-only hashtable has >10
sk in the linked list.

lhash2 and lhash2_mask are added to 'struct inet_hashinfo'.  I take
this chance to plug a 4-byte hole.  It is done by first moving
the existing bind_bucket_cachep up and then adding the new
(int lhash2_mask, *lhash2) after the existing bhash_size.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f0b1e64c
Loading
Loading
Loading
Loading
+2 −0
Original line number Original line Diff line number Diff line
@@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
 * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
 * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
 * @icsk_ulp_ops	   Pluggable ULP control hook
 * @icsk_ulp_ops	   Pluggable ULP control hook
 * @icsk_ulp_data	   ULP private data
 * @icsk_ulp_data	   ULP private data
 * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
 * @icsk_ca_state:	   Congestion control state
 * @icsk_ca_state:	   Congestion control state
 * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
 * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
 * @icsk_pending:	   Scheduled timer event
 * @icsk_pending:	   Scheduled timer event
@@ -101,6 +102,7 @@ struct inet_connection_sock {
	const struct inet_connection_sock_af_ops *icsk_af_ops;
	const struct inet_connection_sock_af_ops *icsk_af_ops;
	const struct tcp_ulp_ops  *icsk_ulp_ops;
	const struct tcp_ulp_ops  *icsk_ulp_ops;
	void			  *icsk_ulp_data;
	void			  *icsk_ulp_data;
	struct hlist_node         icsk_listen_portaddr_node;
	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
	__u8			  icsk_ca_state:6,
	__u8			  icsk_ca_state:6,
				  icsk_ca_setsockopt:1,
				  icsk_ca_setsockopt:1,
+22 −6
Original line number Original line Diff line number Diff line
@@ -133,12 +133,13 @@ struct inet_hashinfo {
	/* Ok, let's try this, I give up, we do need a local binding
	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 * TCP hash as well as the others for fast bind/connect.
	 */
	 */
	struct kmem_cache		*bind_bucket_cachep;
	struct inet_bind_hashbucket	*bhash;
	struct inet_bind_hashbucket	*bhash;

	unsigned int			bhash_size;
	unsigned int			bhash_size;
	/* 4 bytes hole on 64 bit */


	struct kmem_cache		*bind_bucket_cachep;
	/* The 2nd listener table hashed by local port and address */
	unsigned int			lhash2_mask;
	struct inet_listen_hashbucket	*lhash2;


	/* All the above members are written once at bootup and
	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 * never written again _or_ are predominantly read-access.
@@ -146,14 +147,25 @@ struct inet_hashinfo {
	 * Now align to a new cache line as all the following members
	 * Now align to a new cache line as all the following members
	 * might be often dirty.
	 * might be often dirty.
	 */
	 */
	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	/* All sockets in TCP_LISTEN state will be in listening_hash.
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * This is the only table where wildcard'd TCP sockets can
	 * is just local port number.
	 * exist.  listening_hash is only hashed by local port number.
	 * If lhash2 is initialized, the same socket will also be hashed
	 * to lhash2 by port and address.
	 */
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;
					____cacheline_aligned_in_smp;
};
};


/* Walk an lhash2 bucket list with the RCU hlist iterator.
 * @__icsk: cursor; each entry is reached through its
 *          icsk_listen_portaddr_node member.
 * @list:   hlist head of the bucket to walk.
 */
#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)

/* Translate a port+address hash value into its lhash2 bucket.
 * The hash is reduced to a table index by masking it with lhash2_mask.
 * h->lhash2 is indexed unconditionally; no NULL check is done here.
 */
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
	u32 slot = hash & h->lhash2_mask;

	return h->lhash2 + slot;
}

static inline struct inet_ehash_bucket *inet_ehash_bucket(
static inline struct inet_ehash_bucket *inet_ehash_bucket(
	struct inet_hashinfo *hashinfo,
	struct inet_hashinfo *hashinfo,
	unsigned int hash)
	unsigned int hash)
@@ -209,6 +221,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child);
void inet_put_port(struct sock *sk);
void inet_put_port(struct sock *sk);


void inet_hashinfo_init(struct inet_hashinfo *h);
void inet_hashinfo_init(struct inet_hashinfo *h);
void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
			 unsigned long numentries, int scale,
			 unsigned long low_limit,
			 unsigned long high_limit);


bool inet_ehash_insert(struct sock *sk, struct sock *osk);
bool inet_ehash_insert(struct sock *sk, struct sock *osk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
+159 −9
Original line number Original line Diff line number Diff line
@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/vmalloc.h>
#include <linux/bootmem.h>


#include <net/addrconf.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
}
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
EXPORT_SYMBOL_GPL(__inet_inherit_port);


/* Find the lhash2 bucket for a socket.
 * The hash is computed from sock_net(sk), the socket's receive address
 * and its local port (inet_num).  When IPv6 is enabled, AF_INET6 sockets
 * are hashed on sk_v6_rcv_saddr; every other socket is hashed on
 * inet_rcv_saddr.
 */
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	unsigned int port = inet_sk(sk)->inet_num;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return inet_lhash2_bucket(h,
					  ipv6_portaddr_hash(sock_net(sk),
							     &sk->sk_v6_rcv_saddr,
							     port));
#endif
	return inet_lhash2_bucket(h,
				  ipv4_portaddr_hash(sock_net(sk),
						     inet_sk(sk)->inet_rcv_saddr,
						     port));
}

/* Link a socket into the port+address table (lhash2).
 * Does nothing when lhash2 has not been set up.  Otherwise the socket's
 * icsk_listen_portaddr_node is added to its bucket under the bucket
 * lock: AF_INET6 sockets with sk_reuseport set go to the tail, all
 * others to the head.  The bucket's count is bumped accordingly.
 */
static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *bucket;
	struct hlist_node *node;

	if (!h->lhash2)
		return;

	bucket = inet_lhash2_bucket_sk(h, sk);
	node = &inet_csk(sk)->icsk_listen_portaddr_node;

	spin_lock(&bucket->lock);
	if (sk->sk_family == AF_INET6 && sk->sk_reuseport)
		hlist_add_tail_rcu(node, &bucket->head);
	else
		hlist_add_head_rcu(node, &bucket->head);
	bucket->count++;
	spin_unlock(&bucket->lock);
}

/* Unlink a socket from the port+address table (lhash2).
 * Does nothing when lhash2 has not been set up.  If the socket's
 * icsk_listen_portaddr_node is not currently hashed, a one-time warning
 * is raised and nothing else is done.  Otherwise the node is removed
 * and the bucket's count decremented under the bucket lock.
 */
static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct hlist_node *node = &inet_csk(sk)->icsk_listen_portaddr_node;
	struct inet_listen_hashbucket *bucket;

	if (!h->lhash2)
		return;

	/* Only checked once we know lhash2 is in use. */
	if (WARN_ON_ONCE(hlist_unhashed(node)))
		return;

	bucket = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&bucket->lock);
	hlist_del_init_rcu(node);
	bucket->count--;
	spin_unlock(&bucket->lock);
}

static inline int compute_score(struct sock *sk, struct net *net,
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif, bool exact_dif)
				const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
 */
 */


/* called with rcu_read_lock() : No refcount taken on the socket */
/* called with rcu_read_lock() : No refcount taken on the socket */
/* Pick the best-scoring socket from one lhash2 bucket.
 * Every socket on the bucket list is scored with compute_score(); the
 * first socket to reach each new highest score (> 0) becomes the
 * candidate.  When such a socket has sk_reuseport set, the choice is
 * first offered to reuseport_select_sock() and its non-NULL answer is
 * returned immediately.  Returns NULL when nothing scores above 0.
 */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	const bool exact_dif = inet_exact_dif_match(net, skb);
	struct inet_connection_sock *icsk;
	struct sock *best = NULL;
	int best_score = 0;

	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
		struct sock *cand = (struct sock *)icsk;
		struct sock *picked;
		u32 phash;
		int score;

		score = compute_score(cand, net, hnum, daddr,
				      dif, sdif, exact_dif);
		if (score <= best_score)
			continue;

		if (cand->sk_reuseport) {
			phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
			picked = reuseport_select_sock(cand, phash, skb, doff);
			if (picked)
				return picked;
		}

		best = cand;
		best_score = score;
	}

	return best;
}

struct sock *__inet_lookup_listener(struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    struct sk_buff *skb, int doff,
@@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net,
	unsigned int hash = inet_lhashfn(net, hnum);
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	bool exact_dif = inet_exact_dif_match(net, skb);
	bool exact_dif = inet_exact_dif_match(net, skb);
	struct inet_listen_hashbucket *ilb2;
	struct sock *sk, *result = NULL;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
	int score, hiscore = 0;
	unsigned int hash2;
	u32 phash = 0;
	u32 phash = 0;


	if (ilb->count <= 10 || !hashinfo->lhash2)
		goto port_lookup;

	/* Too many sk in the ilb bucket (which is hashed by port alone).
	 * Try lhash2 (which is hashed by port and addr) instead.
	 */

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
	if (ilb2->count > ilb->count)
		goto port_lookup;

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		return result;

	/* Lookup lhash2 with INADDR_ANY */

	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
	if (ilb2->count > ilb->count)
		goto port_lookup;

	return inet_lhash2_lookup(net, ilb2, skb, doff,
				  saddr, sport, daddr, hnum,
				  dif, sdif);

port_lookup:
	sk_for_each_rcu(sk, &ilb->head) {
	sk_for_each_rcu(sk, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr,
		score = compute_score(sk, net, hnum, daddr,
				      dif, sdif, exact_dif);
				      dif, sdif, exact_dif);
@@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
	else
	else
		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
	inet_hash2(hashinfo, sk);
	ilb->count++;
	ilb->count++;
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk)
	struct inet_listen_hashbucket *ilb;
	struct inet_listen_hashbucket *ilb;
	spinlock_t *lock;
	spinlock_t *lock;
	bool listener = false;
	bool listener = false;
	int done;


	if (sk_unhashed(sk))
	if (sk_unhashed(sk))
		return;
		return;
@@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk)
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	}
	}
	spin_lock_bh(lock);
	spin_lock_bh(lock);
	if (sk_unhashed(sk))
		goto unlock;

	if (rcu_access_pointer(sk->sk_reuseport_cb))
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);
		reuseport_detach_sock(sk);
	if (listener)
	if (listener) {
		done = __sk_del_node_init(sk);
		inet_unhash2(hashinfo, sk);
	else
		 __sk_del_node_init(sk);
		done = __sk_nulls_del_node_init_rcu(sk);
	if (done) {
		if (listener)
		 ilb->count--;
		 ilb->count--;
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	} else {
		__sk_nulls_del_node_init_rcu(sk);
	}
	}
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
	spin_unlock_bh(lock);
	spin_unlock_bh(lock);
}
}
EXPORT_SYMBOL_GPL(inet_unhash);
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
		INIT_HLIST_HEAD(&h->listening_hash[i].head);
		INIT_HLIST_HEAD(&h->listening_hash[i].head);
		h->listening_hash[i].count = 0;
		h->listening_hash[i].count = 0;
	}
	}

	h->lhash2 = NULL;
}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
EXPORT_SYMBOL_GPL(inet_hashinfo_init);


/* Allocate and initialise the port+address listener table (lhash2).
 * The table comes from alloc_large_system_hash(), which also fills in
 * h->lhash2_mask; name, numentries, scale, low_limit and high_limit are
 * passed straight through.  Every bucket (indices 0..lhash2_mask) then
 * gets its lock initialised, an empty list head and a zero count.
 */
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	struct inet_listen_hashbucket *bucket;
	unsigned int idx;

	h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2),
					    numentries, scale,
					    0, NULL, &h->lhash2_mask,
					    low_limit, high_limit);

	for (idx = 0; idx <= h->lhash2_mask; idx++) {
		bucket = &h->lhash2[idx];

		spin_lock_init(&bucket->lock);
		INIT_HLIST_HEAD(&bucket->head);
		bucket->count = 0;
	}
}

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int locksz = sizeof(spinlock_t);
+66 −0
Original line number Original line Diff line number Diff line
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
}


/* called with rcu_read_lock() */
/* called with rcu_read_lock() */
/* IPv6 counterpart of the lhash2 bucket lookup.
 * Every socket on the bucket list is scored with compute_score(); the
 * first socket to reach each new highest score (> 0) becomes the
 * candidate.  When such a socket has sk_reuseport set, the choice is
 * first offered to reuseport_select_sock() and its non-NULL answer is
 * returned immediately.  Returns NULL when nothing scores above 0.
 */
static struct sock *inet6_lhash2_lookup(struct net *net,
		struct inet_listen_hashbucket *ilb2,
		struct sk_buff *skb, int doff,
		const struct in6_addr *saddr,
		const __be16 sport, const struct in6_addr *daddr,
		const unsigned short hnum, const int dif, const int sdif)
{
	const bool exact_dif = inet6_exact_dif_match(net, skb);
	struct inet_connection_sock *icsk;
	struct sock *best = NULL;
	int best_score = 0;

	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
		struct sock *cand = (struct sock *)icsk;
		struct sock *picked;
		u32 phash;
		int score;

		score = compute_score(cand, net, hnum, daddr, dif, sdif,
				      exact_dif);
		if (score <= best_score)
			continue;

		if (cand->sk_reuseport) {
			phash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
			picked = reuseport_select_sock(cand, phash, skb, doff);
			if (picked)
				return picked;
		}

		best = cand;
		best_score = score;
	}

	return best;
}

struct sock *inet6_lookup_listener(struct net *net,
struct sock *inet6_lookup_listener(struct net *net,
		struct inet_hashinfo *hashinfo,
		struct inet_hashinfo *hashinfo,
		struct sk_buff *skb, int doff,
		struct sk_buff *skb, int doff,
@@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net,
	unsigned int hash = inet_lhashfn(net, hnum);
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	bool exact_dif = inet6_exact_dif_match(net, skb);
	bool exact_dif = inet6_exact_dif_match(net, skb);
	struct inet_listen_hashbucket *ilb2;
	struct sock *sk, *result = NULL;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
	int score, hiscore = 0;
	unsigned int hash2;
	u32 phash = 0;
	u32 phash = 0;


	if (ilb->count <= 10 || !hashinfo->lhash2)
		goto port_lookup;

	/* Too many sk in the ilb bucket (which is hashed by port alone).
	 * Try lhash2 (which is hashed by port and addr) instead.
	 */

	hash2 = ipv6_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
	if (ilb2->count > ilb->count)
		goto port_lookup;

	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
				     saddr, sport, daddr, hnum,
				     dif, sdif);
	if (result)
		return result;

	/* Lookup lhash2 with in6addr_any */

	hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
	if (ilb2->count > ilb->count)
		goto port_lookup;

	return inet6_lhash2_lookup(net, ilb2, skb, doff,
				   saddr, sport, daddr, hnum,
				   dif, sdif);

port_lookup:
	sk_for_each(sk, &ilb->head) {
	sk_for_each(sk, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
		if (score > hiscore) {
		if (score > hiscore) {