Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 79134e6c authored by Eric Dumazet, committed by David S. Miller
Browse files

net: do not create fallback tunnels for non-default namespaces



fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0,
ip6tnl0, ip6gre0) are automatically created when the corresponding
module is loaded.

These tunnels are also automatically created when a new network
namespace is created, at a great cost.

In many cases, netns are used for isolation purposes, and these
extra network devices are a waste of resources. We are using
thousands of netns per host, and hit the netns creation/delete
bottleneck a lot. (Many thanks to Kirill for recent work on this)

Add a new sysctl so that we can opt out of this automatic creation.

Note that these tunnels are still created for the initial namespace,
to be the least intrusive for typical setups.

Tested:
lpk43:~# cat add_del_unshare.sh
for i in `seq 1 40`
do
 (for j in `seq 1 100` ; do  unshare -n /bin/true >/dev/null ; done) &
done
wait

lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real	0m37.521s
user	0m0.886s
sys	7m7.084s
lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real	0m4.761s
user	0m0.851s
sys	1m8.343s
lpk43:~#

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 2b3905de
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -270,6 +270,18 @@ optmem_max
Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
of struct cmsghdr structures with appended data.

fb_tunnels_only_for_init_net
----------------------------

Controls whether fallback tunnels (like tunl0, gre0, gretap0, erspan0,
sit0, ip6tnl0, ip6gre0) are automatically created when a new
network namespace is created, if the corresponding tunnel is present
in the initial network namespace.
If set to 1, these devices are not automatically created, and
user space is responsible for creating them if needed.

Default : 0  (for compatibility reasons)

2. /proc/sys/net/unix - Parameters for Unix domain sockets
-------------------------------------------------------

+7 −0
Original line number Diff line number Diff line
@@ -585,6 +585,13 @@ struct netdev_queue {
#endif
} ____cacheline_aligned_in_smp;

extern int sysctl_fb_tunnels_only_for_init_net;

/*
 * Tell whether fallback tunnel devices should be created for @net.
 *
 * The initial namespace always gets them; any other namespace gets them
 * only while sysctl_fb_tunnels_only_for_init_net is zero.
 */
static inline bool net_has_fallback_tunnels(const struct net *net)
{
	if (net == &init_net)
		return true;

	return sysctl_fb_tunnels_only_for_init_net == 0;
}

static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
+2 −0
Original line number Diff line number Diff line
@@ -180,8 +180,10 @@ struct tnl_ptk_info {

/* Per-namespace tunnel state, looked up with net_generic(net, ip_tnl_net_id). */
struct ip_tunnel_net {
	/* Fallback device; ip_tunnel_init_net() leaves it NULL when no
	 * fallback tunnel is created for the namespace.
	 */
	struct net_device *fb_tunnel_dev;
	/* Link ops passed to ip_tunnel_init_net(); used by ip_tunnel_create()
	 * so that it does not need to go through fb_tunnel_dev.
	 */
	struct rtnl_link_ops *rtnl_link_ops;
	/* Hash buckets; each head is initialised in ip_tunnel_init_net(). */
	struct hlist_head tunnels[IP_TNL_HASH_SIZE];
	/* NOTE(review): presumably the collect-metadata tunnel, if any -
	 * not used in the visible hunks, confirm against the lookup code.
	 */
	struct ip_tunnel __rcu *collect_md_tun;
	/* Device type used for ip_tunnel_find(); taken from the fallback
	 * device when one exists, otherwise copied from init_net's entry.
	 */
	int type;
};

static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
+12 −0
Original line number Diff line number Diff line
@@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS;

static int net_msg_warn;	/* Unused, but still a sysctl */

/* Backing storage for /proc/sys/net/core/fb_tunnels_only_for_init_net.
 * When non-zero, fallback tunnels are only created for init_net.
 */
int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);

#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = {
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
	{
		.procname	= "fb_tunnels_only_for_init_net",
		.data		= &sysctl_fb_tunnels_only_for_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};

+12 −8
Original line number Diff line number Diff line
@@ -347,8 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
	struct net_device *dev;
	int t_hlen;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

@@ -822,7 +821,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
@@ -847,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
@@ -991,10 +989,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}
@@ -1012,6 +1015,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

@@ -1019,10 +1023,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

@@ -1054,7 +1058,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(itn, &list, ops);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
Loading