Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c7f2733e authored by Jozsef Kadlecsik's avatar Jozsef Kadlecsik Committed by Greg Kroah-Hartman
Browse files

netfilter: ipset: fix performance regression in swap operation

commit 97f7cf1cd80eeed3b7c808b7c12463295c751001 upstream.

The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.

Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.

Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/


Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: default avatarAle Crismani <ale.crismani@automattic.com>
Reported-by: default avatarDavid Wang <00107082@163.com>
Tested-by: default avatarDavid Wang <00107082@163.com>
Signed-off-by: default avatarJozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent d04acadb
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -188,6 +188,8 @@ struct ip_set_type_variant {
	/* Return true if "b" set is the same as "a"
	 * according to the create set parameters */
	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
	/* Cancel ongoing garbage collectors before destroying the set*/
	void (*cancel_gc)(struct ip_set *set);
	/* Region-locking is used */
	bool region_lock;
};
@@ -236,6 +238,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);

/* A generic IP set */
struct ip_set {
	/* For call_cru in destroy */
	struct rcu_head rcu;
	/* The name of the set */
	char name[IPSET_MAXNAMELEN];
	/* Lock protecting the set data */
+11 −3
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#define mtype_del		IPSET_TOKEN(MTYPE, _del)
#define mtype_list		IPSET_TOKEN(MTYPE, _list)
#define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
#define mtype_cancel_gc		IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype			MTYPE

#define get_ext(set, map, id)	((map)->extensions + ((set)->dsize * (id)))
@@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set)
{
	struct mtype *map = set->data;

	if (SET_WITH_TIMEOUT(set))
		del_timer_sync(&map->gc);

	if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
		mtype_ext_cleanup(set);
	ip_set_free(map->members);
@@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t)
	add_timer(&map->gc);
}

static void
mtype_cancel_gc(struct ip_set *set)
{
	struct mtype *map = set->data;

	if (SET_WITH_TIMEOUT(set))
		del_timer_sync(&map->gc);
}

static const struct ip_set_type_variant mtype = {
	.kadt	= mtype_kadt,
	.uadt	= mtype_uadt,
@@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = {
	.head	= mtype_head,
	.list	= mtype_list,
	.same_set = mtype_same_set,
	.cancel_gc = mtype_cancel_gc,
};

#endif /* __IP_SET_BITMAP_IP_GEN_H */
+28 −9
Original line number Diff line number Diff line
@@ -1034,6 +1034,14 @@ ip_set_destroy_set(struct ip_set *set)
	kfree(set);
}

static void
ip_set_destroy_set_rcu(struct rcu_head *head)
{
	struct ip_set *set = container_of(head, struct ip_set, rcu);

	ip_set_destroy_set(set);
}

static int ip_set_destroy(struct net *net, struct sock *ctnl,
			  struct sk_buff *skb, const struct nlmsghdr *nlh,
			  const struct nlattr * const attr[],
@@ -1047,8 +1055,6 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
	if (unlikely(protocol_min_failed(attr)))
		return -IPSET_ERR_PROTOCOL;

	/* Must wait for flush to be really finished in list:set */
	rcu_barrier();

	/* Commands are serialized and references are
	 * protected by the ip_set_ref_lock.
@@ -1060,8 +1066,10 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
	 * counter, so if it's already zero, we can proceed
	 * without holding the lock.
	 */
	read_lock_bh(&ip_set_ref_lock);
	if (!attr[IPSET_ATTR_SETNAME]) {
		/* Must wait for flush to be really finished in list:set */
		rcu_barrier();
		read_lock_bh(&ip_set_ref_lock);
		for (i = 0; i < inst->ip_set_max; i++) {
			s = ip_set(inst, i);
			if (s && (s->ref || s->ref_netlink)) {
@@ -1075,12 +1083,17 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
			s = ip_set(inst, i);
			if (s) {
				ip_set(inst, i) = NULL;
				/* Must cancel garbage collectors */
				s->variant->cancel_gc(s);
				ip_set_destroy_set(s);
			}
		}
		/* Modified by ip_set_destroy() only, which is serialized */
		inst->is_destroyed = false;
	} else {
		u16 features = 0;

		read_lock_bh(&ip_set_ref_lock);
		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
				    &i);
		if (!s) {
@@ -1090,10 +1103,16 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
			ret = -IPSET_ERR_BUSY;
			goto out;
		}
		features = s->type->features;
		ip_set(inst, i) = NULL;
		read_unlock_bh(&ip_set_ref_lock);

		ip_set_destroy_set(s);
		if (features & IPSET_TYPE_NAME) {
			/* Must wait for flush to be really finished  */
			rcu_barrier();
		}
		/* Must cancel garbage collectors */
		s->variant->cancel_gc(s);
		call_rcu(&s->rcu, ip_set_destroy_set_rcu);
	}
	return 0;
out:
@@ -1252,9 +1271,6 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
	ip_set(inst, to_id) = from;
	write_unlock_bh(&ip_set_ref_lock);

	/* Make sure all readers of the old set pointers are completed. */
	synchronize_rcu();

	return 0;
}

@@ -2267,8 +2283,11 @@ ip_set_fini(void)
{
	nf_unregister_sockopt(&so_set);
	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);

	unregister_pernet_subsys(&ip_set_net_ops);

	/* Wait for call_rcu() in destroy */
	rcu_barrier();

	pr_debug("these are the famous last words\n");
}

+12 −3
Original line number Diff line number Diff line
@@ -235,6 +235,7 @@ htable_size(u8 hbits)
#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
#undef mtype_cancel_gc
#undef mtype_variant
#undef mtype_data_match

@@ -279,6 +280,7 @@ htable_size(u8 hbits)
#define mtype_gc_do		IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_cancel_gc		IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant		IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match	IPSET_TOKEN(MTYPE, _data_match)

@@ -464,9 +466,6 @@ mtype_destroy(struct ip_set *set)
	struct htype *h = set->data;
	struct list_head *l, *lt;

	if (SET_WITH_TIMEOUT(set))
		cancel_delayed_work_sync(&h->gc.dwork);

	mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
	list_for_each_safe(l, lt, &h->ad) {
		list_del(l);
@@ -613,6 +612,15 @@ mtype_gc_init(struct htable_gc *gc)
	queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
}

static void
mtype_cancel_gc(struct ip_set *set)
{
	struct htype *h = set->data;

	if (SET_WITH_TIMEOUT(set))
		cancel_delayed_work_sync(&h->gc.dwork);
}

static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
	  struct ip_set_ext *mext, u32 flags);
@@ -1433,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = {
	.uref	= mtype_uref,
	.resize	= mtype_resize,
	.same_set = mtype_same_set,
	.cancel_gc = mtype_cancel_gc,
	.region_lock = true,
};

+10 −3
Original line number Diff line number Diff line
@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
	struct list_set *map = set->data;
	struct set_elem *e, *n;

	if (SET_WITH_TIMEOUT(set))
		del_timer_sync(&map->gc);

	list_for_each_entry_safe(e, n, &map->members, list) {
		list_del(&e->list);
		ip_set_put_byindex(map->net, e->id);
@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
	       a->extensions == b->extensions;
}

static void
list_set_cancel_gc(struct ip_set *set)
{
	struct list_set *map = set->data;

	if (SET_WITH_TIMEOUT(set))
		del_timer_sync(&map->gc);
}

static const struct ip_set_type_variant set_variant = {
	.kadt	= list_set_kadt,
	.uadt	= list_set_uadt,
@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
	.head	= list_set_head,
	.list	= list_set_list,
	.same_set = list_set_same_set,
	.cancel_gc = list_set_cancel_gc,
};

static void