
Commit 93bb0ceb authored by Jesper Dangaard Brouer, committed by Pablo Neira Ayuso

netfilter: conntrack: remove central spinlock nf_conntrack_lock



nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current generation servers (8 or more cores/threads).

Perf shows the lock contention clearly on the base kernel:

-  72.56%  ksoftirqd/6  [kernel.kallsyms]    [k] _raw_spin_lock_bh
   - _raw_spin_lock_bh
      + 25.33% init_conntrack
      + 24.86% nf_ct_delete_from_lists
      + 24.62% __nf_conntrack_confirm
      + 24.38% destroy_conntrack
      + 0.70% tcp_packet
+   2.21%  ksoftirqd/6  [kernel.kallsyms]    [k] fib_table_lookup
+   1.15%  ksoftirqd/6  [kernel.kallsyms]    [k] __slab_free
+   0.77%  ksoftirqd/6  [kernel.kallsyms]    [k] inet_getpeer
+   0.70%  ksoftirqd/6  [nf_conntrack]       [k] nf_ct_delete
+   0.55%  ksoftirqd/6  [ip_tables]          [k] ipt_do_table

This patch changes conntrack locking and provides a huge performance
improvement.  SYN-flood attack tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (with the trafgen tool):

 Base kernel:   810.405 new conntrack/sec
 After patch: 2.233.876 new conntrack/sec

Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:
 # iptables -A INPUT -m state --state INVALID -j DROP
 # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0

Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table. 1024 spinlocks seem to give good
results, at minimal cost (4KB of memory). Due to the lockdep maximum lock
depth, 1024 becomes 8 when CONFIG_LOCKDEP=y.
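
As a rough sketch of the scheme (nf_conntrack_locks and CONNTRACK_LOCKS are
the names this patch introduces; the snippet itself is illustrative, not
code taken verbatim from the patch):

 #ifdef CONFIG_LOCKDEP
 # define CONNTRACK_LOCKS 8	/* taking all 1024 would exceed lockdep's depth */
 #else
 # define CONNTRACK_LOCKS 1024
 #endif
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];

 /* A bucket's lock is picked by hashing the bucket index into the array,
  * so two operations only contend when their buckets map to the same lock.
  */
 spinlock_t *lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];

 spin_lock(lockp);
 /* ... insert into / delete from net->ct.hash[hash] ... */
 spin_unlock(lockp);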

The hash resize is a bit tricky, because we need to take all locks in
the array. A seqcount_t is used to synchronize the hash table users
with the resizing process.
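
A minimal sketch of that retry pattern (it mirrors the nf_conntrack_double_lock()
helper added below; net, zone and the two tuple pointers are placeholders):

 unsigned int hash, reply_hash, sequence;

 local_bh_disable();
 do {
 	/* Sample the resize generation, then hash against the current table. */
 	sequence = read_seqcount_begin(&net->ct.generation);
 	hash = hash_conntrack(net, zone, orig_tuple);
 	reply_hash = hash_conntrack(net, zone, reply_tuple);
 	/* Takes both bucket locks in order and returns true if a resize ran
 	 * in between, i.e. the hashes are stale and must be recomputed.
 	 */
 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 /* ... work on the two buckets ... */
 nf_conntrack_double_unlock(hash, reply_hash);
 local_bh_enable();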

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
parent ca7433df
+6 −1
@@ -77,7 +77,12 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
            const struct nf_conntrack_l3proto *l3proto,
            const struct nf_conntrack_l4proto *proto);

extern spinlock_t nf_conntrack_lock;
#ifdef CONFIG_LOCKDEP
# define CONNTRACK_LOCKS 8
#else
# define CONNTRACK_LOCKS 1024
#endif
extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];

extern spinlock_t nf_conntrack_expect_lock;

+2 −0
@@ -5,6 +5,7 @@
#include <linux/list_nulls.h>
#include <linux/atomic.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/seqlock.h>

struct ctl_table_header;
struct nf_conntrack_ecache;
@@ -90,6 +91,7 @@ struct netns_ct {
	int			sysctl_checksum;

	unsigned int		htable_size;
	seqcount_t		generation;
	struct kmem_cache	*nf_conntrack_cachep;
	struct hlist_nulls_head	*hash;
	struct hlist_head	*expect_hash;
+159 −60
@@ -60,12 +60,60 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

DEFINE_SPINLOCK(nf_conntrack_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_lock);
__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		spin_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&net->ct.generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
{
	int i;

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_nested(&nf_conntrack_locks[i], i);
}

static void nf_conntrack_all_unlock(void)
{
	int i;

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_unlock(&nf_conntrack_locks[i]);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

@@ -280,15 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	u16 zone = nf_ct_zone(ct);
	unsigned int sequence;

	nf_ct_helper_destroy(ct);
	spin_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(net, delete_list);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&net->ct.generation);
		hash = hash_conntrack(net, zone,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net, zone,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);

	nf_ct_add_to_dying_list(ct);
	spin_unlock_bh(&nf_conntrack_lock);

	NF_CT_STAT_INC(net, delete_list);
	local_bh_enable();
}

static void death_by_event(unsigned long ul_conntrack)
@@ -372,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 * OR
 * - Caller must lock nf_conntrack_lock before calling this function
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, u16 zone,
@@ -467,14 +526,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	u16 zone;
	unsigned int sequence;

	zone = nf_ct_zone(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&net->ct.generation);
		hash = hash_conntrack(net, zone,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net, zone,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	spin_lock_bh(&nf_conntrack_lock);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -493,14 +556,15 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	/* The caller holds a reference to this object */
	atomic_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	spin_unlock_bh(&nf_conntrack_lock);

	local_bh_enable();
	return 0;

out:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	spin_unlock_bh(&nf_conntrack_lock);
	local_bh_enable();
	return -EEXIST;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -540,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	enum ip_conntrack_info ctinfo;
	struct net *net;
	u16 zone;
	unsigned int sequence;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);
@@ -552,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&net->ct.generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = hash_bucket(hash, net);
		reply_hash = hash_conntrack(net, zone,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	 * confirmed us.
	 */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);

	spin_lock_bh(&nf_conntrack_lock);

	/* We have to check the DYING flag inside the lock to prevent
	   a race against nf_ct_get_next_corpse() possibly called from
	   user context, else we insert an already 'dead' hash, blocking
	   further use of that particular connection -JM */

	if (unlikely(nf_ct_is_dying(ct))) {
		spin_unlock_bh(&nf_conntrack_lock);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_ACCEPT;
	}

@@ -618,8 +689,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	spin_unlock_bh(&nf_conntrack_lock);
	local_bh_enable();

	help = nfct_help(ct);
	if (help && help->helper)
@@ -630,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	return NF_ACCEPT;

out:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	spin_unlock_bh(&nf_conntrack_lock);
	local_bh_enable();
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -674,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static noinline int early_drop(struct net *net, unsigned int hash)
static noinline int early_drop(struct net *net, unsigned int _hash)
{
	/* Use oldest entry, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL, *tmp;
	struct hlist_nulls_node *n;
	unsigned int i, cnt = 0;
	unsigned int i = 0, cnt = 0;
	int dropped = 0;
	unsigned int hash, sequence;
	spinlock_t *lockp;

	rcu_read_lock();
	for (i = 0; i < net->ct.htable_size; i++) {
	local_bh_disable();
restart:
	sequence = read_seqcount_begin(&net->ct.generation);
	hash = hash_bucket(_hash, net);
	for (; i < net->ct.htable_size; i++) {
		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
		spin_lock(lockp);
		if (read_seqcount_retry(&net->ct.generation, sequence)) {
			spin_unlock(lockp);
			goto restart;
		}
		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
					 hnnode) {
			tmp = nf_ct_tuplehash_to_ctrack(h);
			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
			    !nf_ct_is_dying(tmp) &&
			    atomic_inc_not_zero(&tmp->ct_general.use)) {
				ct = tmp;
				break;
			}
			cnt++;
		}

		if (ct != NULL) {
			if (likely(!nf_ct_is_dying(ct) &&
				   atomic_inc_not_zero(&ct->ct_general.use)))
				break;
			else
				ct = NULL;
		}
		hash = (hash + 1) % net->ct.htable_size;
		spin_unlock(lockp);

		if (cnt >= NF_CT_EVICTION_RANGE)
		if (ct || cnt >= NF_CT_EVICTION_RANGE)
			break;

		hash = (hash + 1) % net->ct.htable_size;
	}
	rcu_read_unlock();
	local_bh_enable();

	if (!ct)
		return dropped;
@@ -755,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		if (!early_drop(net, hash_bucket(hash, net))) {
		if (!early_drop(net, hash)) {
			atomic_dec(&net->ct.count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
@@ -1304,9 +1386,13 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	int cpu;
	spinlock_t *lockp;

	spin_lock_bh(&nf_conntrack_lock);
	for (; *bucket < net->ct.htable_size; (*bucket)++) {
		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		spin_lock(lockp);
		if (*bucket < net->ct.htable_size) {
			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
					continue;
@@ -1315,7 +1401,9 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
					goto found;
			}
		}
	spin_unlock_bh(&nf_conntrack_lock);
		spin_unlock(lockp);
		local_bh_enable();
	}

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
@@ -1331,7 +1419,8 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock_bh(&nf_conntrack_lock);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

@@ -1532,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
	if (!hash)
		return -ENOMEM;

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&init_net.ct.generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the lock.
	 * though since that required taking the locks.
	 */
	spin_lock_bh(&nf_conntrack_lock);

	for (i = 0; i < init_net.ct.htable_size; i++) {
		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
			h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1554,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)

	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
	init_net.ct.hash = hash;
	spin_unlock_bh(&nf_conntrack_lock);

	write_seqcount_end(&init_net.ct.generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	nf_ct_free_hashtable(old_hash, old_size);
	return 0;
@@ -1576,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
int nf_conntrack_init_start(void)
{
	int max_factor = 8;
	int ret, cpu;
	int i, ret, cpu;

	for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
+8 −4
@@ -423,12 +423,16 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
			unhelp(h, me);
		spin_unlock_bh(&pcpu->lock);
	}
	spin_lock_bh(&nf_conntrack_lock);
	local_bh_disable();
	for (i = 0; i < net->ct.htable_size; i++) {
		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
		if (i < net->ct.htable_size) {
			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
				unhelp(h, me);
		}
	spin_unlock_bh(&nf_conntrack_lock);
		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
	}
	local_bh_enable();
}

void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+13 −2
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	u_int8_t l3proto = nfmsg->nfgen_family;
	int res;
	spinlock_t *lockp;

#ifdef CONFIG_NF_CONNTRACK_MARK
	const struct ctnetlink_dump_filter *filter = cb->data;
#endif

	spin_lock_bh(&nf_conntrack_lock);
	last = (struct nf_conn *)cb->args[1];

	local_bh_disable();
	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
restart:
		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
		spin_lock(lockp);
		if (cb->args[0] >= net->ct.htable_size) {
			spin_unlock(lockp);
			goto out;
		}
		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
					 hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
			if (res < 0) {
				nf_conntrack_get(&ct->ct_general);
				cb->args[1] = (unsigned long)ct;
				spin_unlock(lockp);
				goto out;
			}
		}
		spin_unlock(lockp);
		if (cb->args[1]) {
			cb->args[1] = 0;
			goto restart;
		}
	}
out:
	spin_unlock_bh(&nf_conntrack_lock);
	local_bh_enable();
	if (last)
		nf_ct_put(last);