Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 230140cf authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller
Browse files

[INET]: Remove per bucket rwlock in tcp/dccp ehash table.



As done two years ago on the IP route cache table (commit
22c047cc), we can avoid using one
lock per hash bucket for the huge TCP/DCCP hash tables.

On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for
little performance difference. (we hit a different cache line for the
rwlock, but then the bucket cache line has a better sharing factor
among cpus, since we dirty it less often). For netstat or ss commands
that want a full scan of the hash table, we perform fewer memory accesses.

Using a 'small' table of hashed rwlocks should be more than enough to
provide correct SMP concurrency between different buckets, without
using too much memory. Sizing of this table depends on
num_possible_cpus() and various CONFIG settings.

This patch provides some locking abstraction that may ease a future
work using a different model for TCP/DCCP table.

Signed-off-by: default avatarEric Dumazet <dada1@cosmosbay.com>
Acked-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent efac5276
Loading
Loading
Loading
Loading
+65 −6
Original line number Diff line number Diff line
@@ -37,7 +37,6 @@
 * I'll experiment with dynamic table growth later.
 */
struct inet_ehash_bucket {
	rwlock_t	  lock;
	struct hlist_head chain;
	struct hlist_head twchain;
};
@@ -100,6 +99,9 @@ struct inet_hashinfo {
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */
	struct inet_ehash_bucket	*ehash;
	rwlock_t			*ehash_locks;
	unsigned int			ehash_size;
	unsigned int			ehash_locks_mask;

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
@@ -107,7 +109,7 @@ struct inet_hashinfo {
	struct inet_bind_hashbucket	*bhash;

	unsigned int			bhash_size;
	unsigned int			ehash_size;
	/* Note : 4 bytes padding on 64 bit arches */

	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
@@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
	return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
}

/*
 * Return the rwlock guarding the ehash bucket selected by @hash.
 * Several buckets share each lock; ehash_locks_mask picks the slot.
 */
static inline rwlock_t *inet_ehash_lockp(
	struct inet_hashinfo *hashinfo,
	unsigned int hash)
{
	unsigned int slot = hash & hashinfo->ehash_locks_mask;

	return &hashinfo->ehash_locks[slot];
}

/*
 * Allocate the table of rwlocks that protect the ehash buckets.
 *
 * The table size scales with num_possible_cpus(), from 256 up to 4096
 * entries, so unrelated buckets rarely contend on the same lock while
 * the table stays far smaller than one lock per bucket.  Under
 * CONFIG_PROVE_LOCKING the cpu count is pinned to 2 to keep lockdep's
 * per-lock tracking cheap.
 *
 * Returns 0 on success and sets hashinfo->ehash_locks and
 * hashinfo->ehash_locks_mask; returns -ENOMEM on allocation failure
 * (hashinfo->ehash_locks is then NULL, the mask is left untouched).
 */
static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int i, size = 256;
#if defined(CONFIG_PROVE_LOCKING)
	unsigned int nr_pcpus = 2;
#else
	unsigned int nr_pcpus = num_possible_cpus();
#endif
	if (nr_pcpus >= 4)
		size = 512;
	if (nr_pcpus >= 8)
		size = 1024;
	if (nr_pcpus >= 16)
		size = 2048;
	if (nr_pcpus >= 32)
		size = 4096;
	/* rwlock_t may be zero-sized (!SMP && !DEBUG builds); skip the
	 * allocation entirely in that case, the mask alone suffices. */
	if (sizeof(rwlock_t) != 0) {
#ifdef CONFIG_NUMA
		/* A table larger than a page is vmalloc'ed so it can be
		 * spread across NUMA nodes. */
		if (size * sizeof(rwlock_t) > PAGE_SIZE)
			hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t));
		else
#endif
		hashinfo->ehash_locks =	kmalloc(size * sizeof(rwlock_t),
						GFP_KERNEL);
		if (!hashinfo->ehash_locks)
			return -ENOMEM;	/* kernel convention: negative errno */
		for (i = 0; i < size; i++)
			rwlock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = size - 1;
	return 0;
}

/*
 * Release the lock table set up by inet_ehash_locks_alloc().
 *
 * Must mirror the allocation strategy: on CONFIG_NUMA a table larger
 * than a page came from vmalloc(), everything else from kmalloc().
 *
 * Fix: the previous version paired the NUMA branch with #else/#endif,
 * so on NUMA builds the dangling "else" after vfree() captured the
 * "ehash_locks = NULL" statement instead of calling kfree() — leaking
 * a kmalloc'ed table and leaving a stale pointer after vfree().  The
 * kfree() call must sit after #endif so it runs whenever vfree() does
 * not, and the pointer is always reset.
 */
static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
	if (hashinfo->ehash_locks) {
#ifdef CONFIG_NUMA
		unsigned int size = (hashinfo->ehash_locks_mask + 1) *
							sizeof(rwlock_t);
		if (size > PAGE_SIZE)
			vfree(hashinfo->ehash_locks);
		else
#endif
		kfree(hashinfo->ehash_locks);
		hashinfo->ehash_locks = NULL;
	}
}

extern struct inet_bind_bucket *
		    inet_bind_bucket_create(struct kmem_cache *cachep,
					    struct inet_bind_hashbucket *head,
@@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo,
		sk->sk_hash = inet_sk_ehashfn(sk);
		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
		list = &head->chain;
		lock = &head->lock;
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		write_lock(lock);
	}
	__sk_add_node(sk, list);
@@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
		inet_listen_wlock(hashinfo);
		lock = &hashinfo->lhash_lock;
	} else {
		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		write_lock_bh(lock);
	}

@@ -354,9 +412,10 @@ static inline struct sock *
	 */
	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(&head->lock);
	read_lock(lock);
	sk_for_each(sk, node, &head->chain) {
		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
@@ -369,7 +428,7 @@ static inline struct sock *
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	read_unlock(lock);
	return sk;
hit:
	sock_hold(sk);
+7 −2
Original line number Diff line number Diff line
@@ -1072,11 +1072,13 @@ static int __init dccp_init(void)
	}

	for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
		rwlock_init(&dccp_hashinfo.ehash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
	}

	if (inet_ehash_locks_alloc(&dccp_hashinfo))
			goto out_free_dccp_ehash;

	bhash_order = ehash_order;

	do {
@@ -1091,7 +1093,7 @@ static int __init dccp_init(void)

	if (!dccp_hashinfo.bhash) {
		DCCP_CRIT("Failed to allocate DCCP bind hash table");
		goto out_free_dccp_ehash;
		goto out_free_dccp_locks;
	}

	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -1121,6 +1123,8 @@ static int __init dccp_init(void)
out_free_dccp_bhash:
	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
	dccp_hashinfo.bhash = NULL;
out_free_dccp_locks:
	inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
	dccp_hashinfo.ehash = NULL;
@@ -1139,6 +1143,7 @@ static void __exit dccp_fini(void)
	free_pages((unsigned long)dccp_hashinfo.ehash,
		   get_order(dccp_hashinfo.ehash_size *
			     sizeof(struct inet_ehash_bucket)));
	inet_ehash_locks_free(&dccp_hashinfo);
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_ackvec_exit();
	dccp_sysctl_exit();
+5 −4
Original line number Diff line number Diff line
@@ -747,13 +747,14 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)

	for (i = s_i; i < hashinfo->ehash_size; i++) {
		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
		struct sock *sk;
		struct hlist_node *node;

		if (i > s_i)
			s_num = 0;

		read_lock_bh(&head->lock);
		read_lock_bh(lock);
		num = 0;
		sk_for_each(sk, node, &head->chain) {
			struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
			    r->id.idiag_dport)
				goto next_normal;
			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
				read_unlock_bh(&head->lock);
				read_unlock_bh(lock);
				goto done;
			}
next_normal:
@@ -791,14 +792,14 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
				    r->id.idiag_dport)
					goto next_dying;
				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
					read_unlock_bh(&head->lock);
					read_unlock_bh(lock);
					goto done;
				}
next_dying:
				++num;
			}
		}
		read_unlock_bh(&head->lock);
		read_unlock_bh(lock);
	}

done:
+4 −3
Original line number Diff line number Diff line
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	prefetch(head->chain.first);
	write_lock(&head->lock);
	write_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);
	write_unlock(lock);

	if (twp) {
		*twp = tw;
@@ -255,7 +256,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
	return 0;

not_unique:
	write_unlock(&head->lock);
	write_unlock(lock);
	return -EADDRNOTAVAIL;
}

+7 −6
Original line number Diff line number Diff line
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
	struct inet_bind_hashbucket *bhead;
	struct inet_bind_bucket *tb;
	/* Unlink from established hashes. */
	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);

	write_lock(&ehead->lock);
	write_lock(lock);
	if (hlist_unhashed(&tw->tw_node)) {
		write_unlock(&ehead->lock);
		write_unlock(lock);
		return;
	}
	__hlist_del(&tw->tw_node);
	sk_node_init(&tw->tw_node);
	write_unlock(&ehead->lock);
	write_unlock(lock);

	/* Disassociate with bind bucket. */
	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
	const struct inet_sock *inet = inet_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	struct inet_bind_hashbucket *bhead;
	/* Step 1: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
	spin_unlock(&bhead->lock);

	write_lock(&ehead->lock);
	write_lock(lock);

	/* Step 2: Remove SK from established hash. */
	if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
	inet_twsk_add_node(tw, &ehead->twchain);
	atomic_inc(&tw->tw_refcnt);

	write_unlock(&ehead->lock);
	write_unlock(lock);
}

EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
Loading