Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4d95b72f authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'netns-scalability'



Nicolas Dichtel says:

====================
netns: ease netlink use with a lot of netns

This idea was informally discussed in Ottawa / netdev0.1. The goal is to
ease the use/scalability of netns, from a userland point of view.
Today, users need to open one netlink socket per family and per netns.
Thus, when the number of netns inscreases (for example 5K or more), the
number of sockets needed to manage them grows a lot.

The goal of this series is to be able to monitor netlink events, for a
specified family, for a set of netns, with only one netlink socket. For
this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID.
When this option is set on a netlink socket, this socket will receive
netlink notifications from all netns that have a nsid assigned into the
netns where the socket has been opened.
The nsid is sent to userland via an anscillary data.

Here is an example with a patched iproute2. vxlan10 is created in the
current netns (netns0, nsid 0) and then moved to another netns (netns1,
nsid 1):

$ ip netns exec netns0 ip monitor all-nsid label
[nsid 0][NSID]nsid 1 (iproute2 netns name: netns1)
[nsid 0][NEIGH]??? lladdr 00:00:00:00:00:00 REACHABLE,PERMANENT
[nsid 0][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff
[nsid 0][LINK]Deleted 5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff
[nsid 1][NSID]nsid 0 (iproute2 netns name: netns0)
[nsid 1][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff link-netnsid 0
[nsid 1][ADDR]5: vxlan10    inet 192.168.0.249/24 brd 192.168.0.255 scope global vxlan10
       valid_lft forever preferred_lft forever
[nsid 1][ROUTE]local 192.168.0.249 dev vxlan10  table local  proto kernel  scope host  src 192.168.0.249
[nsid 1][ROUTE]ff00::/8 dev vxlan10  table local  metric 256  pref medium
[nsid 1][ROUTE]2001:123::/64 dev vxlan10  proto kernel  metric 256  pref medium
[nsid 1][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff link-netnsid 0
[nsid 1][ROUTE]broadcast 192.168.0.255 dev vxlan10  table local  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]192.168.0.0/24 dev vxlan10  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]broadcast 192.168.0.0 dev vxlan10  table local  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]fe80::/64 dev vxlan10  proto kernel  metric 256  pref medium
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 43996fdd 59324cf3
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,

	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
			peernet2id_alloc(dev_net(vxlan->dev), vxlan->net)))
		goto nla_put_failure;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+2 −0
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@ struct netlink_skb_parms {
	__u32			dst_group;
	__u32			flags;
	struct sock		*sk;
	bool			nsid_is_set;
	int			nsid;
};

#define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
+2 −0
Original line number Diff line number Diff line
@@ -271,7 +271,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
#define __net_initconst	__initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer);
int peernet2id(struct net *net, struct net *peer);
bool peernet_has_id(struct net *net, struct net *peer);
struct net *get_net_ns_by_id(struct net *net, int id);

struct pernet_operations {
+1 −0
Original line number Diff line number Diff line
@@ -108,6 +108,7 @@ struct nlmsgerr {
#define NETLINK_NO_ENOBUFS	5
#define NETLINK_RX_RING		6
#define NETLINK_TX_RING		7
#define NETLINK_LISTEN_ALL_NSID	8

struct nl_pktinfo {
	__u32	group;
+86 −46
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
DEFINE_MUTEX(net_mutex);
static DEFINE_SPINLOCK(nsid_lock);

LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -147,24 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops,
	}
}

static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
			      int id);
/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
	int min = 0, max = 0, id;

	ASSERT_RTNL();
	int min = 0, max = 0;

	if (reqid >= 0) {
		min = reqid;
		max = reqid + 1;
	}

	id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
	if (id >= 0)
		rtnl_net_notifyid(net, peer, RTM_NEWNSID, id);

	return id;
	return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
}

/* This function is used by idr_for_each(). If net is equal to peer, the
@@ -180,11 +174,16 @@ static int net_eq_idr(int id, void *net, void *peer)
	return 0;
}

static int __peernet2id(struct net *net, struct net *peer, bool alloc)
/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
 * is set to true, thus the caller knows that the new id must be notified via
 * rtnl.
 */
static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{
	int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
	bool alloc_it = *alloc;

	ASSERT_RTNL();
	*alloc = false;

	/* Magic value for id 0. */
	if (id == NET_ID_ZERO)
@@ -192,36 +191,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc)
	if (id > 0)
		return id;

	if (alloc)
		return alloc_netid(net, peer, -1);
	if (alloc_it) {
		id = alloc_netid(net, peer, -1);
		*alloc = true;
		return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
	}

	return -ENOENT;
	return NETNSA_NSID_NOT_ASSIGNED;
}

/* should be called with nsid_lock held */
static int __peernet2id(struct net *net, struct net *peer)
{
	bool no = false;

	return __peernet2id_alloc(net, peer, &no);
}

static void rtnl_net_notifyid(struct net *net, int cmd, int id);
/* This function returns the id of a peer netns. If no id is assigned, one will
 * be allocated and returned.
 */
int peernet2id_alloc(struct net *net, struct net *peer)
{
	unsigned long flags;
	bool alloc;
	int id;

	spin_lock_irqsave(&nsid_lock, flags);
	alloc = atomic_read(&peer->count) == 0 ? false : true;
	id = __peernet2id_alloc(net, peer, &alloc);
	spin_unlock_irqrestore(&nsid_lock, flags);
	if (alloc && id >= 0)
		rtnl_net_notifyid(net, RTM_NEWNSID, id);
	return id;
}
EXPORT_SYMBOL(peernet2id_alloc);

/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
{
	bool alloc = atomic_read(&peer->count) == 0 ? false : true;
	unsigned long flags;
	int id;

	id = __peernet2id(net, peer, alloc);
	return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
	spin_lock_irqsave(&nsid_lock, flags);
	id = __peernet2id(net, peer);
	spin_unlock_irqrestore(&nsid_lock, flags);
	return id;
}

/* This function returns true is the peer netns has an id assigned into the
 * current netns.
 */
bool peernet_has_id(struct net *net, struct net *peer)
{
	return peernet2id(net, peer) >= 0;
}
EXPORT_SYMBOL(peernet2id);

struct net *get_net_ns_by_id(struct net *net, int id)
{
	unsigned long flags;
	struct net *peer;

	if (id < 0)
		return NULL;

	rcu_read_lock();
	spin_lock_irqsave(&nsid_lock, flags);
	peer = idr_find(&net->netns_ids, id);
	if (peer)
		get_net(peer);
	spin_unlock_irqrestore(&nsid_lock, flags);
	rcu_read_unlock();

	return peer;
@@ -362,14 +402,19 @@ static void cleanup_net(struct work_struct *work)
		list_del_rcu(&net->list);
		list_add_tail(&net->exit_list, &net_exit_list);
		for_each_net(tmp) {
			int id = __peernet2id(tmp, net, false);
			int id;

			if (id >= 0) {
				rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
			spin_lock_irq(&nsid_lock);
			id = __peernet2id(tmp, net);
			if (id >= 0)
				idr_remove(&tmp->netns_ids, id);
			spin_unlock_irq(&nsid_lock);
			if (id >= 0)
				rtnl_net_notifyid(tmp, RTM_DELNSID, id);
		}
		}
		spin_lock_irq(&nsid_lock);
		idr_destroy(&net->netns_ids);
		spin_unlock_irq(&nsid_lock);

	}
	rtnl_unlock();
@@ -497,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr *tb[NETNSA_MAX + 1];
	unsigned long flags;
	struct net *peer;
	int nsid, err;

@@ -517,14 +563,18 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
	if (IS_ERR(peer))
		return PTR_ERR(peer);

	if (__peernet2id(net, peer, false) >= 0) {
	spin_lock_irqsave(&nsid_lock, flags);
	if (__peernet2id(net, peer) >= 0) {
		err = -EEXIST;
		goto out;
	}

	err = alloc_netid(net, peer, nsid);
	if (err > 0)
	spin_unlock_irqrestore(&nsid_lock, flags);
	if (err >= 0) {
		rtnl_net_notifyid(net, RTM_NEWNSID, err);
		err = 0;
	}
out:
	put_net(peer);
	return err;
@@ -538,14 +588,10 @@ static int rtnl_net_get_size(void)
}

static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
			 int cmd, struct net *net, struct net *peer,
			 int nsid)
			 int cmd, struct net *net, int nsid)
{
	struct nlmsghdr *nlh;
	struct rtgenmsg *rth;
	int id;

	ASSERT_RTNL();

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
	if (!nlh)
@@ -554,14 +600,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
	rth = nlmsg_data(nlh);
	rth->rtgen_family = AF_UNSPEC;

	if (nsid >= 0) {
		id = nsid;
	} else {
		id = __peernet2id(net, peer, false);
		if  (id < 0)
			id = NETNSA_NSID_NOT_ASSIGNED;
	}
	if (nla_put_s32(skb, NETNSA_NSID, id))
	if (nla_put_s32(skb, NETNSA_NSID, nsid))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
@@ -578,7 +617,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
	struct nlattr *tb[NETNSA_MAX + 1];
	struct sk_buff *msg;
	struct net *peer;
	int err;
	int err, id;

	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
			  rtnl_net_policy);
@@ -600,8 +639,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
		goto out;
	}

	id = peernet2id(net, peer);
	err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
			    RTM_GETNSID, net, peer, -1);
			    RTM_GETNSID, net, id);
	if (err < 0)
		goto err_out;

@@ -633,7 +673,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)

	ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
			    net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			    RTM_NEWNSID, net_cb->net, peer, id);
			    RTM_NEWNSID, net_cb->net, id);
	if (ret < 0)
		return ret;

@@ -652,17 +692,17 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
		.idx = 0,
		.s_idx = cb->args[0],
	};
	unsigned long flags;

	ASSERT_RTNL();

	spin_lock_irqsave(&nsid_lock, flags);
	idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
	spin_unlock_irqrestore(&nsid_lock, flags);

	cb->args[0] = net_cb.idx;
	return skb->len;
}

static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
			      int id)
static void rtnl_net_notifyid(struct net *net, int cmd, int id)
{
	struct sk_buff *msg;
	int err = -ENOMEM;
@@ -671,7 +711,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
	if (!msg)
		goto out;

	err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id);
	err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
	if (err < 0)
		goto err_out;

Loading