Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b7779d06 authored by Jesper Dangaard Brouer's avatar Jesper Dangaard Brouer Committed by Pablo Neira Ayuso
Browse files

netfilter: conntrack: spinlock per cpu to protect special lists.



One spinlock per cpu to protect dying/unconfirmed/template special lists.
(These lists are now per cpu, a bit like the untracked ct)
Add a @cpu field to nf_conn, to make sure we hold the appropriate
spinlock at removal time.

Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarJesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
Reviewed-by: default avatarFlorian Westphal <fw@strlen.de>
Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
parent b476b72a
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -83,6 +83,7 @@ struct nf_conn {
	struct nf_conntrack ct_general;

	spinlock_t	lock;
	u16		cpu;

	/* XXX should I move this to the tail ? - Y.K */
	/* These are my tuples; original and reply */
+8 −3
Original line number Diff line number Diff line
@@ -62,6 +62,13 @@ struct nf_ip_net {
#endif
};

struct ct_pcpu {
	spinlock_t		lock;
	struct hlist_nulls_head unconfirmed;
	struct hlist_nulls_head dying;
	struct hlist_nulls_head tmpl;
};

struct netns_ct {
	atomic_t		count;
	unsigned int		expect_count;
@@ -86,9 +93,7 @@ struct netns_ct {
	struct kmem_cache	*nf_conntrack_cachep;
	struct hlist_nulls_head	*hash;
	struct hlist_head	*expect_hash;
	struct hlist_nulls_head	unconfirmed;
	struct hlist_nulls_head	dying;
	struct hlist_nulls_head tmpl;
	struct ct_pcpu __percpu *pcpu_lists;
	struct ip_conntrack_stat __percpu *stat;
	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
	struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
+103 −38
Original line number Diff line number Diff line
@@ -192,6 +192,50 @@ clean_from_lists(struct nf_conn *ct)
	nf_ct_remove_expectations(ct);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) dying list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) unconfirmed list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->unconfirmed);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* We overload first tuple to link into unconfirmed or dying list.*/
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
@@ -220,9 +264,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
	 * too. */
	nf_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed or dying list.*/
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	NF_CT_STAT_INC(net, delete);
	spin_unlock_bh(&nf_conntrack_lock);
@@ -244,9 +286,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(net, delete_list);
	clean_from_lists(ct);
	/* add this conntrack to the dying list */
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);
	nf_ct_add_to_dying_list(ct);
	spin_unlock_bh(&nf_conntrack_lock);
}

@@ -467,15 +507,22 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
/* deletion from this larval template list happens via nf_ct_put() */
void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
{
	struct ct_pcpu *pcpu;

	__set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
	nf_conntrack_get(&tmpl->ct_general);

	spin_lock_bh(&nf_conntrack_lock);
	/* add this conntrack to the (per cpu) tmpl list */
	local_bh_disable();
	tmpl->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);

	spin_lock(&pcpu->lock);
	/* Overload tuple linked list to put us in template list. */
	hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.tmpl);
	spin_unlock_bh(&nf_conntrack_lock);
				 &pcpu->tmpl);
	spin_unlock_bh(&pcpu->lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);

@@ -546,8 +593,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
			goto out;

	/* Remove from unconfirmed list */
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
@@ -879,10 +925,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,

	/* Now it is inserted into the unconfirmed list, bump refcount */
	nf_conntrack_get(&ct->ct_general);

	/* Overload tuple linked list to put us in unconfirmed list. */
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
		       &net->ct.unconfirmed);
	nf_ct_add_to_unconfirmed_list(ct);

	spin_unlock_bh(&nf_conntrack_lock);

@@ -1254,6 +1297,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	int cpu;

	spin_lock_bh(&nf_conntrack_lock);
	for (; *bucket < net->ct.htable_size; (*bucket)++) {
@@ -1265,12 +1309,19 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
				goto found;
		}
	}
	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
	spin_unlock_bh(&nf_conntrack_lock);

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_bh(&pcpu->lock);
		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (iter(ct, data))
				set_bit(IPS_DYING_BIT, &ct->status);
		}
	spin_unlock_bh(&nf_conntrack_lock);
		spin_unlock_bh(&pcpu->lock);
	}
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
@@ -1323,14 +1374,19 @@ static void nf_ct_release_dying_list(struct net *net)
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	int cpu;

	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_bh(&pcpu->lock);
		hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			/* never fails to remove them, no listeners at this point */
			nf_ct_kill(ct);
		}
	spin_unlock_bh(&nf_conntrack_lock);
		spin_unlock_bh(&pcpu->lock);
	}
}

static int untrack_refs(void)
@@ -1417,6 +1473,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
		kmem_cache_destroy(net->ct.nf_conntrack_cachep);
		kfree(net->ct.slabname);
		free_percpu(net->ct.stat);
		free_percpu(net->ct.pcpu_lists);
	}
}

@@ -1629,37 +1686,43 @@ void nf_conntrack_init_end(void)

int nf_conntrack_init_net(struct net *net)
{
	int ret;
	int ret = -ENOMEM;
	int cpu;

	atomic_set(&net->ct.count, 0);
	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
	INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL);
	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat) {
		ret = -ENOMEM;

	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
	if (!net->ct.pcpu_lists)
		goto err_stat;

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_init(&pcpu->lock);
		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
	}

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		goto err_pcpu_lists;

	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
	if (!net->ct.slabname) {
		ret = -ENOMEM;
	if (!net->ct.slabname)
		goto err_slabname;
	}

	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
							sizeof(struct nf_conn), 0,
							SLAB_DESTROY_BY_RCU, NULL);
	if (!net->ct.nf_conntrack_cachep) {
		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
		ret = -ENOMEM;
		goto err_cache;
	}

	net->ct.htable_size = nf_conntrack_htable_size;
	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
	if (!net->ct.hash) {
		ret = -ENOMEM;
		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
		goto err_hash;
	}
@@ -1701,6 +1764,8 @@ int nf_conntrack_init_net(struct net *net)
	kfree(net->ct.slabname);
err_slabname:
	free_percpu(net->ct.stat);
err_pcpu_lists:
	free_percpu(net->ct.pcpu_lists);
err_stat:
	return ret;
}
+9 −2
Original line number Diff line number Diff line
@@ -396,6 +396,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
	const struct hlist_node *next;
	const struct hlist_nulls_node *nn;
	unsigned int i;
	int cpu;

	/* Get rid of expectations */
	for (i = 0; i < nf_ct_expect_hsize; i++) {
@@ -414,8 +415,14 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
	}

	/* Get rid of expecteds, set helpers to NULL. */
	hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_bh(&pcpu->lock);
		hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
			unhelp(h, me);
		spin_unlock_bh(&pcpu->lock);
	}
	for (i = 0; i < net->ct.htable_size; i++) {
		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
			unhelp(h, me);
+46 −35
Original line number Diff line number Diff line
@@ -1137,21 +1137,34 @@ static int ctnetlink_done_list(struct netlink_callback *cb)
}

static int
ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
		    struct hlist_nulls_head *list)
ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
{
	struct nf_conn *ct, *last;
	struct nf_conn *ct, *last = NULL;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	u_int8_t l3proto = nfmsg->nfgen_family;
	int res;
	int cpu;
	struct hlist_nulls_head *list;
	struct net *net = sock_net(skb->sk);

	if (cb->args[2])
		return 0;

	spin_lock_bh(&nf_conntrack_lock);
	if (cb->args[0] == nr_cpu_ids)
		return 0;

	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
		struct ct_pcpu *pcpu;

		if (!cpu_possible(cpu))
			continue;

		pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
		spin_lock_bh(&pcpu->lock);
		last = (struct nf_conn *)cb->args[1];
		list = dying ? &pcpu->dying : &pcpu->unconfirmed;
restart:
		hlist_nulls_for_each_entry(h, n, list, hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1171,6 +1184,7 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
			if (res < 0) {
				nf_conntrack_get(&ct->ct_general);
				cb->args[1] = (unsigned long)ct;
				spin_unlock_bh(&pcpu->lock);
				goto out;
			}
		}
@@ -1179,8 +1193,9 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
			goto restart;
		} else
			cb->args[2] = 1;
		spin_unlock_bh(&pcpu->lock);
	}
out:
	spin_unlock_bh(&nf_conntrack_lock);
	if (last)
		nf_ct_put(last);

@@ -1190,9 +1205,7 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);

	return ctnetlink_dump_list(skb, cb, &net->ct.dying);
	return ctnetlink_dump_list(skb, cb, true);
}

static int
@@ -1214,9 +1227,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
static int
ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);

	return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed);
	return ctnetlink_dump_list(skb, cb, false);
}

static int