Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ea781f19 authored by Eric Dumazet, committed by Patrick McHardy
Browse files

netfilter: nf_conntrack: use SLAB_DESTROY_BY_RCU and get rid of call_rcu()



Use "hlist_nulls" infrastructure we added in 2.6.29 for RCUification of UDP & TCP.

This permits an easy conversion from call_rcu() based hash lists to a
SLAB_DESTROY_BY_RCU one.

Avoiding call_rcu() delay at nf_conn freeing time has numerous gains.

First, it doesn't fill the RCU queues (up to 10000 elements per cpu).
This reduces the possibility of OOM if queued elements are not taken into
account. It also reduces latency problems when the RCU queue size hits
hilimit and triggers emergency mode.

- It allows fast reuse of just freed elements, permitting better use of
CPU cache.

- We delete rcu_head from "struct nf_conn", shrinking the size of this
structure by 8 or 16 bytes.

This patch only takes care of "struct nf_conn".
call_rcu() is still used for less critical conntrack parts that may
be converted later if necessary.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
parent 1f9352ae
Loading
Loading
Loading
Loading
+8 −6
Original line number Original line Diff line number Diff line
@@ -91,8 +91,7 @@ struct nf_conn_help {
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>


struct nf_conn
struct nf_conn {
{
	/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
	/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
           plus 1 for any connection(s) we are `master' for */
           plus 1 for any connection(s) we are `master' for */
	struct nf_conntrack ct_general;
	struct nf_conntrack ct_general;
@@ -126,7 +125,6 @@ struct nf_conn
#ifdef CONFIG_NET_NS
#ifdef CONFIG_NET_NS
	struct net *ct_net;
	struct net *ct_net;
#endif
#endif
	struct rcu_head rcu;
};
};


static inline struct nf_conn *
static inline struct nf_conn *
@@ -190,9 +188,13 @@ static inline void nf_ct_put(struct nf_conn *ct)
extern int nf_ct_l3proto_try_module_get(unsigned short l3proto);
extern int nf_ct_l3proto_try_module_get(unsigned short l3proto);
extern void nf_ct_l3proto_module_put(unsigned short l3proto);
extern void nf_ct_l3proto_module_put(unsigned short l3proto);


extern struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced);
/*
extern void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced,
 * Allocate a hashtable of hlist_head (if nulls == 0),
				 unsigned int size);
 * or hlist_nulls_head (if nulls == 1)
 */
extern void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls);

extern void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size);


extern struct nf_conntrack_tuple_hash *
extern struct nf_conntrack_tuple_hash *
__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple);
__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple);
+3 −3
Original line number Original line Diff line number Diff line
@@ -12,6 +12,7 @@


#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/list_nulls.h>


/* A `tuple' is a structure containing the information to uniquely
/* A `tuple' is a structure containing the information to uniquely
  identify a connection.  ie. if two packets have the same tuple, they
  identify a connection.  ie. if two packets have the same tuple, they
@@ -146,9 +147,8 @@ static inline void nf_ct_dump_tuple(const struct nf_conntrack_tuple *t)
	((enum ip_conntrack_dir)(h)->tuple.dst.dir)
	((enum ip_conntrack_dir)(h)->tuple.dst.dir)


/* Connections have two entries in the hash table: one for each way */
/* Connections have two entries in the hash table: one for each way */
struct nf_conntrack_tuple_hash
struct nf_conntrack_tuple_hash {
{
	struct hlist_nulls_node hnnode;
	struct hlist_node hnode;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple tuple;
};
};


+3 −2
Original line number Original line Diff line number Diff line
@@ -2,6 +2,7 @@
#define __NETNS_CONNTRACK_H
#define __NETNS_CONNTRACK_H


#include <linux/list.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <asm/atomic.h>
#include <asm/atomic.h>


struct ctl_table_header;
struct ctl_table_header;
@@ -10,9 +11,9 @@ struct nf_conntrack_ecache;
struct netns_ct {
struct netns_ct {
	atomic_t		count;
	atomic_t		count;
	unsigned int		expect_count;
	unsigned int		expect_count;
	struct hlist_head	*hash;
	struct hlist_nulls_head	*hash;
	struct hlist_head	*expect_hash;
	struct hlist_head	*expect_hash;
	struct hlist_head	unconfirmed;
	struct hlist_nulls_head	unconfirmed;
	struct ip_conntrack_stat *stat;
	struct ip_conntrack_stat *stat;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
	struct nf_conntrack_ecache *ecache;
+36 −27
Original line number Original line Diff line number Diff line
@@ -25,40 +25,42 @@ struct ct_iter_state {
	unsigned int bucket;
	unsigned int bucket;
};
};


static struct hlist_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
{
	struct net *net = seq_file_net(seq);
	struct net *net = seq_file_net(seq);
	struct ct_iter_state *st = seq->private;
	struct ct_iter_state *st = seq->private;
	struct hlist_node *n;
	struct hlist_nulls_node *n;


	for (st->bucket = 0;
	for (st->bucket = 0;
	     st->bucket < nf_conntrack_htable_size;
	     st->bucket < nf_conntrack_htable_size;
	     st->bucket++) {
	     st->bucket++) {
		n = rcu_dereference(net->ct.hash[st->bucket].first);
		n = rcu_dereference(net->ct.hash[st->bucket].first);
		if (n)
		if (!is_a_nulls(n))
			return n;
			return n;
	}
	}
	return NULL;
	return NULL;
}
}


static struct hlist_node *ct_get_next(struct seq_file *seq,
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
				      struct hlist_node *head)
				      struct hlist_nulls_node *head)
{
{
	struct net *net = seq_file_net(seq);
	struct net *net = seq_file_net(seq);
	struct ct_iter_state *st = seq->private;
	struct ct_iter_state *st = seq->private;


	head = rcu_dereference(head->next);
	head = rcu_dereference(head->next);
	while (head == NULL) {
	while (is_a_nulls(head)) {
		if (likely(get_nulls_value(head) == st->bucket)) {
			if (++st->bucket >= nf_conntrack_htable_size)
			if (++st->bucket >= nf_conntrack_htable_size)
				return NULL;
				return NULL;
		}
		head = rcu_dereference(net->ct.hash[st->bucket].first);
		head = rcu_dereference(net->ct.hash[st->bucket].first);
	}
	}
	return head;
	return head;
}
}


static struct hlist_node *ct_get_idx(struct seq_file *seq, loff_t pos)
static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
{
{
	struct hlist_node *head = ct_get_first(seq);
	struct hlist_nulls_node *head = ct_get_first(seq);


	if (head)
	if (head)
		while (pos && (head = ct_get_next(seq, head)))
		while (pos && (head = ct_get_next(seq, head)))
@@ -87,69 +89,76 @@ static void ct_seq_stop(struct seq_file *s, void *v)


static int ct_seq_show(struct seq_file *s, void *v)
static int ct_seq_show(struct seq_file *s, void *v)
{
{
	const struct nf_conntrack_tuple_hash *hash = v;
	struct nf_conntrack_tuple_hash *hash = v;
	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
	const struct nf_conntrack_l3proto *l3proto;
	const struct nf_conntrack_l3proto *l3proto;
	const struct nf_conntrack_l4proto *l4proto;
	const struct nf_conntrack_l4proto *l4proto;
	int ret = 0;


	NF_CT_ASSERT(ct);
	NF_CT_ASSERT(ct);
	if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
		return 0;



	/* we only want to print DIR_ORIGINAL */
	/* we only want to print DIR_ORIGINAL */
	if (NF_CT_DIRECTION(hash))
	if (NF_CT_DIRECTION(hash))
		return 0;
		goto release;
	if (nf_ct_l3num(ct) != AF_INET)
	if (nf_ct_l3num(ct) != AF_INET)
		return 0;
		goto release;


	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
	NF_CT_ASSERT(l3proto);
	NF_CT_ASSERT(l3proto);
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	NF_CT_ASSERT(l4proto);
	NF_CT_ASSERT(l4proto);


	ret = -ENOSPC;
	if (seq_printf(s, "%-8s %u %ld ",
	if (seq_printf(s, "%-8s %u %ld ",
		      l4proto->name, nf_ct_protonum(ct),
		      l4proto->name, nf_ct_protonum(ct),
		      timer_pending(&ct->timeout)
		      timer_pending(&ct->timeout)
		      ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
		      ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
		return -ENOSPC;
		goto release;


	if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
	if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
		return -ENOSPC;
		goto release;


	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			l3proto, l4proto))
			l3proto, l4proto))
		return -ENOSPC;
		goto release;


	if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
	if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
		return -ENOSPC;
		goto release;


	if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
	if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
		if (seq_printf(s, "[UNREPLIED] "))
		if (seq_printf(s, "[UNREPLIED] "))
			return -ENOSPC;
			goto release;


	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
			l3proto, l4proto))
			l3proto, l4proto))
		return -ENOSPC;
		goto release;


	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
		return -ENOSPC;
		goto release;


	if (test_bit(IPS_ASSURED_BIT, &ct->status))
	if (test_bit(IPS_ASSURED_BIT, &ct->status))
		if (seq_printf(s, "[ASSURED] "))
		if (seq_printf(s, "[ASSURED] "))
			return -ENOSPC;
			goto release;


#ifdef CONFIG_NF_CONNTRACK_MARK
#ifdef CONFIG_NF_CONNTRACK_MARK
	if (seq_printf(s, "mark=%u ", ct->mark))
	if (seq_printf(s, "mark=%u ", ct->mark))
		return -ENOSPC;
		goto release;
#endif
#endif


#ifdef CONFIG_NF_CONNTRACK_SECMARK
#ifdef CONFIG_NF_CONNTRACK_SECMARK
	if (seq_printf(s, "secmark=%u ", ct->secmark))
	if (seq_printf(s, "secmark=%u ", ct->secmark))
		return -ENOSPC;
		goto release;
#endif
#endif


	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
		return -ENOSPC;
		goto release;

	ret = 0;
	return 0;
release:
	nf_ct_put(ct);
	return ret;
}
}


static const struct seq_operations ct_seq_ops = {
static const struct seq_operations ct_seq_ops = {
+1 −1
Original line number Original line Diff line number Diff line
@@ -679,7 +679,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
static int __net_init nf_nat_net_init(struct net *net)
static int __net_init nf_nat_net_init(struct net *net)
{
{
	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
						      &net->ipv4.nat_vmalloced);
						      &net->ipv4.nat_vmalloced, 0);
	if (!net->ipv4.nat_bysource)
	if (!net->ipv4.nat_bysource)
		return -ENOMEM;
		return -ENOMEM;
	return 0;
	return 0;
Loading