Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5caea4ea authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller
Browse files

net: listening_hash get a spinlock per bucket



This patch prepares RCU migration of listening_hash table for
TCP/DCCP protocols.

listening_hash table being small (32 slots per protocol), we add
a spinlock for each slot, instead of a single rwlock for whole table.

This should reduce hold time of readers, and writers concurrency.

Signed-off-by: default avatarEric Dumazet <dada1@cosmosbay.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent d8b83c57
Loading
Loading
Loading
Loading
+15 −30
Original line number Diff line number Diff line
@@ -99,6 +99,11 @@ struct inet_bind_hashbucket {
	struct hlist_head	chain;
};

struct inet_listen_hashbucket {
	spinlock_t		lock;
	struct hlist_head	head;
};

/* This is for listening sockets, thus all sockets which possess wildcards. */
#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */

@@ -123,22 +128,21 @@ struct inet_hashinfo {
	unsigned int			bhash_size;
	/* Note : 4 bytes padding on 64 bit arches */

	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct hlist_head		listening_hash[INET_LHTABLE_SIZE];
	struct kmem_cache		*bind_bucket_cachep;

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * are often dirty.
	 * might be often dirty.
	 */
	rwlock_t			lhash_lock ____cacheline_aligned;
	atomic_t			lhash_users;
	wait_queue_head_t		lhash_wait;
	struct kmem_cache			*bind_bucket_cachep;
	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;

};

static inline struct inet_ehash_bucket *inet_ehash_bucket(
@@ -236,26 +240,7 @@ extern void __inet_inherit_port(struct sock *sk, struct sock *child);

extern void inet_put_port(struct sock *sk);

extern void inet_listen_wlock(struct inet_hashinfo *hashinfo);

/*
 * - We may sleep inside this lock.
 * - If sleeping is not required (or called from BH),
 *   use plain read_(un)lock(&inet_hashinfo.lhash_lock).
 */
static inline void inet_listen_lock(struct inet_hashinfo *hashinfo)
{
	/* read_lock synchronizes to candidates to writers */
	read_lock(&hashinfo->lhash_lock);
	atomic_inc(&hashinfo->lhash_users);
	read_unlock(&hashinfo->lhash_lock);
}

static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo)
{
	if (atomic_dec_and_test(&hashinfo->lhash_users))
		wake_up(&hashinfo->lhash_wait);
}
void inet_hashinfo_init(struct inet_hashinfo *h);

extern void __inet_hash_nolisten(struct sock *sk);
extern void inet_hash(struct sock *sk);
+2 −6
Original line number Diff line number Diff line
@@ -44,12 +44,7 @@ atomic_t dccp_orphan_count = ATOMIC_INIT(0);

EXPORT_SYMBOL_GPL(dccp_orphan_count);

struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
};

struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);

/* the maximum queue length for tx in packets. 0 is no limit */
@@ -1030,6 +1025,7 @@ static int __init dccp_init(void)
	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));

	inet_hashinfo_init(&dccp_hashinfo);
	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
+7 −5
Original line number Diff line number Diff line
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
			goto skip_listen_ht;

		inet_listen_lock(hashinfo);
		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
			struct sock *sk;
			struct hlist_node *node;
			struct inet_listen_hashbucket *ilb;

			num = 0;
			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
			ilb = &hashinfo->listening_hash[i];
			spin_lock_bh(&ilb->lock);
			sk_for_each(sk, node, &ilb->head) {
				struct inet_sock *inet = inet_sk(sk);

				if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
					goto syn_recv;

				if (inet_csk_diag_dump(sk, skb, cb) < 0) {
					inet_listen_unlock(hashinfo);
					spin_unlock_bh(&ilb->lock);
					goto done;
				}

@@ -751,7 +753,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
					goto next_listen;

				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
					inet_listen_unlock(hashinfo);
					spin_unlock_bh(&ilb->lock);
					goto done;
				}

@@ -760,12 +762,12 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
				cb->args[4] = 0;
				++num;
			}
			spin_unlock_bh(&ilb->lock);

			s_num = 0;
			cb->args[3] = 0;
			cb->args[4] = 0;
		}
		inet_listen_unlock(hashinfo);
skip_listen_ht:
		cb->args[0] = 1;
		s_i = num = s_num = 0;
+31 −55
Original line number Diff line number Diff line
@@ -110,35 +110,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)

EXPORT_SYMBOL_GPL(__inet_inherit_port);

/*
 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
	__acquires(hashinfo->lhash_lock)
{
	write_lock(&hashinfo->lhash_lock);

	if (atomic_read(&hashinfo->lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&hashinfo->lhash_users))
				break;
			write_unlock_bh(&hashinfo->lhash_lock);
			schedule();
			write_lock_bh(&hashinfo->lhash_lock);
		}

		finish_wait(&hashinfo->lhash_wait, &wait);
	}
}

/*
 * Don't inline this cruft. Here are some nice properties to exploit here. The
 * BSD API does not allow a listening sock to specify the remote port nor the
@@ -191,25 +162,25 @@ struct sock *__inet_lookup_listener(struct net *net,
				    const int dif)
{
	struct sock *sk = NULL;
	const struct hlist_head *head;
	struct inet_listen_hashbucket *ilb;

	read_lock(&hashinfo->lhash_lock);
	head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
	if (!hlist_empty(head)) {
		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
	spin_lock(&ilb->lock);
	if (!hlist_empty(&ilb->head)) {
		const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
			goto sherry_cache;
		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
		sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&hashinfo->lhash_lock);
	spin_unlock(&ilb->lock);
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -389,8 +360,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk);
@@ -398,14 +368,12 @@ static void __inet_hash(struct sock *sk)
	}

	WARN_ON(!sk_unhashed(sk));
	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
	lock = &hashinfo->lhash_lock;
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	inet_listen_wlock(hashinfo);
	__sk_add_node(sk, list);
	spin_lock(&ilb->lock);
	__sk_add_node(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock(lock);
	wake_up(&hashinfo->lhash_wait);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
@@ -420,29 +388,27 @@ EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	rwlock_t *lock;
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	if (sk_unhashed(sk))
		goto out;
		return;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		inet_listen_wlock(hashinfo);
		lock = &hashinfo->lhash_lock;
		struct inet_listen_hashbucket *ilb;

		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
		spin_lock_bh(&ilb->lock);
		if (__sk_del_node_init(sk))
			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(&ilb->lock);
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		write_lock_bh(lock);
		if (__sk_nulls_del_node_init_rcu(sk))
			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	}

		write_unlock_bh(lock);
out:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&hashinfo->lhash_wait);
	}
}
EXPORT_SYMBOL_GPL(inet_unhash);

@@ -556,3 +522,13 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
}

EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++)
		spin_lock_init(&h->listening_hash[i].lock);
}

EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+12 −12
Original line number Diff line number Diff line
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
}
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
struct inet_hashinfo tcp_hashinfo;

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
@@ -1874,15 +1870,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		ilb = &tcp_hashinfo.listening_hash[0];
		spin_lock_bh(&ilb->lock);
		sk = sk_head(&ilb->head);
		goto get_sk;
	}

	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1932,8 +1931,11 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
@@ -2130,7 +2129,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
@@ -2405,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_device(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
Loading