Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 81c3d547, authored by Eric Dumazet, committed by David S. Miller
Browse files

[INET]: speedup inet (tcp/dccp) lookups



Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)

(The other important patch moves skc_refcnt into a separate cache line,
so that SMP/NUMA performance doesn't suffer from cache line ping-pong.)

1) First some performance data :
--------------------------------

tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()

The most time critical code is :

sk_for_each(sk, node, &head->chain) {
     if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
         goto hit; /* You sunk my battleship! */
}

The sk_for_each() does use prefetch() hints, but only the beginning of
"struct sock" is prefetched.

As the first comparison in INET_MATCH uses inet_sk(__sk)->daddr, which is
far away from the beginning of "struct sock", it has to bring a cold cache
line into the CPU cache. Each iteration has to use at least 2 cache
lines.

This can be problematic if some chains are very long.

2) The goal
-----------

The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.

3) Description of the patch
---------------------------

Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.

struct sock_common {
	unsigned short		skc_family;
	volatile unsigned char	skc_state;
	unsigned char		skc_reuse;
	int			skc_bound_dev_if;
	struct hlist_node	skc_node;
	struct hlist_node	skc_bind_node;
	atomic_t		skc_refcnt;
+	unsigned int		skc_hash;
	struct proto		*skc_prot;
};

Store in this 32 bits field the full hash, not masked by (ehash_size -
1). Using this full hash as the first comparison done in INET_MATCH
permits us to immediately skip the element without touching a second
cache line in case of a miss.

Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)

File include/net/inet_hashtables.h

64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
     (((__sk)->sk_hash == (__hash))                         &&  \
     ((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie))   &&  \
     ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))   &&  \
     (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

32bits platforms:
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
     (((__sk)->sk_hash == (__hash))                 &&  \
     (inet_sk(__sk)->daddr          == (__saddr))   &&  \
     (inet_sk(__sk)->rcv_saddr      == (__daddr))   &&  \
     ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))   &&  \
     (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))


- Adds a prefetch(head->chain.first) in 
__inet_lookup_established()/__tcp_v4_check_established() and 
__inet6_lookup_established()/__tcp_v6_check_established() and 
__dccp_v4_check_established() to bring into cache the first element of the 
list, before the {read|write}_lock(&head->lock);

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 399de50b
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -372,8 +372,9 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
#define inet_v6_ipv6only(__sk)		0
#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */

#define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif)	   \
	(((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))  	&& \
#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\
	(((__sk)->sk_hash == (__hash))				&& \
	 ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))  	&& \
	 ((__sk)->sk_family		== AF_INET6)		&& \
	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&& \
	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&& \
+1 −1
Original line number Diff line number Diff line
@@ -71,7 +71,7 @@ enum
	TCF_META_ID_SK_SNDBUF,
 	TCF_META_ID_SK_ALLOCS,
 	TCF_META_ID_SK_ROUTE_CAPS,
 	TCF_META_ID_SK_HASHENT,
 	TCF_META_ID_SK_HASH,
 	TCF_META_ID_SK_LINGERTIME,
 	TCF_META_ID_SK_ACK_BACKLOG,
 	TCF_META_ID_SK_MAX_ACK_BACKLOG,
+10 −11
Original line number Diff line number Diff line
@@ -26,19 +26,18 @@
struct inet_hashinfo;

/* I have no idea if this is a good hash for v6 or not. -DaveM */
static inline int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport,
				const struct in6_addr *faddr, const u16 fport,
				const int ehash_size)
static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport,
				const struct in6_addr *faddr, const u16 fport)
{
	int hashent = (lport ^ fport);
	unsigned int hashent = (lport ^ fport);

	hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
	hashent ^= hashent >> 16;
	hashent ^= hashent >> 8;
	return (hashent & (ehash_size - 1));
	return hashent;
}

static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size)
static inline int inet6_sk_ehashfn(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -46,7 +45,7 @@ static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size)
	const struct in6_addr *faddr = &np->daddr;
	const __u16 lport = inet->num;
	const __u16 fport = inet->dport;
	return inet6_ehashfn(laddr, lport, faddr, fport, ehash_size);
	return inet6_ehashfn(laddr, lport, faddr, fport);
}

/*
@@ -69,14 +68,14 @@ static inline struct sock *
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	const int hash = inet6_ehashfn(daddr, hnum, saddr, sport,
				       hashinfo->ehash_size);
	struct inet_ehash_bucket *head = &hashinfo->ehash[hash];
	unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		/* For IPV6 do the cheaper port and family tests first. */
		if (INET6_MATCH(sk, saddr, daddr, ports, dif))
		if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}
	/* Must check for a TIME_WAIT'er before going to listener hash. */
+38 −26
Original line number Diff line number Diff line
@@ -108,7 +108,7 @@ struct inet_hashinfo {
	struct inet_bind_hashbucket	*bhash;

	int				bhash_size;
	int				ehash_size;
	unsigned int			ehash_size;

	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
@@ -130,17 +130,16 @@ struct inet_hashinfo {
	int				port_rover;
};

static inline int inet_ehashfn(const __u32 laddr, const __u16 lport,
			       const __u32 faddr, const __u16 fport,
			       const int ehash_size)
static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
			       const __u32 faddr, const __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (ehash_size - 1);
	return h;
}

static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size)
static inline int inet_sk_ehashfn(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const __u32 laddr = inet->rcv_saddr;
@@ -148,7 +147,14 @@ static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size)
	const __u32 faddr = inet->daddr;
	const __u16 fport = inet->dport;

	return inet_ehashfn(laddr, lport, faddr, fport, ehash_size);
	return inet_ehashfn(laddr, lport, faddr, fport);
}

static inline struct inet_ehash_bucket *inet_ehash_bucket(
	struct inet_hashinfo *hashinfo,
	unsigned int hash)
{
	return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
}

extern struct inet_bind_bucket *
@@ -235,9 +241,11 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo,
		lock = &hashinfo->lhash_lock;
		inet_listen_wlock(hashinfo);
	} else {
		sk->sk_hashent = inet_sk_ehashfn(sk, hashinfo->ehash_size);
		list = &hashinfo->ehash[sk->sk_hashent].chain;
		lock = &hashinfo->ehash[sk->sk_hashent].lock;
		struct inet_ehash_bucket *head;
		sk->sk_hash = inet_sk_ehashfn(sk);
		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
		list = &head->chain;
		lock = &head->lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
@@ -268,9 +276,8 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
		inet_listen_wlock(hashinfo);
		lock = &hashinfo->lhash_lock;
	} else {
		struct inet_ehash_bucket *head = &hashinfo->ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
		write_lock_bh(lock);
	}

	if (__sk_del_node_init(sk))
@@ -337,23 +344,27 @@ static inline struct sock *
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
	const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr));
#endif /* __BIG_ENDIAN */
#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
	(((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie))	&&	\
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
	(((__sk)->sk_hash == (__hash))				&&	\
	 ((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie))	&&	\
	 ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))	&&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
	(((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) &&	\
#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
	(((__sk)->sk_hash == (__hash))				&&	\
	 ((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) &&	\
	 ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#else /* 32-bit arch */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr)
#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)	\
	((inet_sk(__sk)->daddr		== (__saddr))		&&	\
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)	\
	(((__sk)->sk_hash == (__hash))				&&	\
	 (inet_sk(__sk)->daddr		== (__saddr))		&&	\
	 (inet_sk(__sk)->rcv_saddr	== (__daddr))		&&	\
	 ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))	&&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)	\
	((inet_twsk(__sk)->tw_daddr	== (__saddr))		&&	\
#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif)	\
	(((__sk)->sk_hash == (__hash))				&&	\
	 (inet_twsk(__sk)->tw_daddr	== (__saddr))		&&	\
	 (inet_twsk(__sk)->tw_rcv_saddr	== (__daddr))		&&	\
	 ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
@@ -378,18 +389,19 @@ static inline struct sock *
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	const int hash = inet_ehashfn(daddr, hnum, saddr, sport, hashinfo->ehash_size);
	struct inet_ehash_bucket *head = &hashinfo->ehash[hash];
	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) {
		if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
		if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
+1 −1
Original line number Diff line number Diff line
@@ -112,6 +112,7 @@ struct inet_timewait_sock {
#define tw_node			__tw_common.skc_node
#define tw_bind_node		__tw_common.skc_bind_node
#define tw_refcnt		__tw_common.skc_refcnt
#define tw_hash			__tw_common.skc_hash
#define tw_prot			__tw_common.skc_prot
	volatile unsigned char	tw_substate;
	/* 3 bits hole, try to pack */
@@ -126,7 +127,6 @@ struct inet_timewait_sock {
	/* And these are ours. */
	__u8			tw_ipv6only:1;
	/* 31 bits hole, try to pack */
	int			tw_hashent;
	int			tw_timeout;
	unsigned long		tw_ttd;
	struct inet_bind_bucket	*tw_tb;
Loading