
Commit 6c325f4e authored by David S. Miller

Merge branch 'net_sched-fix-races-with-RCU-callbacks'

Cong Wang says:

====================
net_sched: fix races with RCU callbacks

Recently, the RCU callbacks used in TC filters and TC actions have kept
drawing my attention; they have introduced at least 4 race condition bugs:

1. A simple one fixed by Daniel:

commit c78e1746
Author: Daniel Borkmann <daniel@iogearbox.net>
Date:   Wed May 20 17:13:33 2015 +0200

    net: sched: fix call_rcu() race on classifier module unloads

2. A very nasty one fixed by me:

commit 1697c4bb
Author: Cong Wang <xiyou.wangcong@gmail.com>
Date:   Mon Sep 11 16:33:32 2017 -0700

    net_sched: carefully handle tcf_block_put()

3. Two more bugs found by Chris:
https://patchwork.ozlabs.org/patch/826696/
https://patchwork.ozlabs.org/patch/826695/



Usually RCU callbacks are simple; however, for TC filters and actions they
are complex, because TC actions can be destroyed together with the TC
filter in one callback. RCU callbacks are also invoked in BH context and,
without locking, they run in parallel with each other. All of this
contributes to these nasty bugs.
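
For reference, this is roughly what one of these callbacks looked like
before this series (the cls_basic one, reconstructed from the old lines in
the diff below): it destroys the filter's extensions, and with them any
bound actions, directly from softirq context with no lock held at all.

static void basic_delete_filter(struct rcu_head *head)
{
	struct basic_filter *f = container_of(head, struct basic_filter, rcu);

	tcf_exts_destroy(&f->exts);	/* may destroy bound TC actions */
	tcf_em_tree_destroy(&f->ematches);
	kfree(f);
}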

Alternatively, we could:

a) Introduce a spinlock to serialize these RCU callbacks. But as I
said in commit 1697c4bb ("net_sched: carefully handle
tcf_block_put()"), this is very hard to do because of tcf_chain_dump():
we would potentially need a lot of work to make it possible (if it is
possible at all).

b) Just get rid of these RCU callbacks, because they are not
necessary at all: the callers of these call_rcu()'s are all on slow paths
and hold the RTNL lock, so blocking is allowed in their contexts.
However, David and Eric dislike adding synchronize_rcu() here.

As suggested by Paul, we can defer the work to a workqueue and regain
the permission to hold the RTNL lock, without any performance impact.
However, in tcf_block_put() we could deadlock if we flushed the
workqueue while holding the RTNL lock, because the queued works take
the RTNL lock themselves. The trick here is to defer the block's own
teardown to the same workqueue and to queue it after all other works,
so that the same ordering is kept and any use-after-free is avoided.
Please see the first patch for details.
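
Concretely, each filter's call_rcu() callback no longer destroys anything
itself; it only queues a work item, and the work handler takes the RTNL
lock and does the destruction in process context. A condensed sketch of
the pattern (foo_filter is a placeholder name; the real cls_basic
conversion is in the diff below). The rcu_head and the work_struct can
share storage, because the rcu_head is no longer needed once the RCU
callback has run:

struct foo_filter {
	struct tcf_exts exts;
	union {
		struct work_struct	work;
		struct rcu_head		rcu;
	};
};

static void foo_delete_filter_work(struct work_struct *work)
{
	struct foo_filter *f = container_of(work, struct foo_filter, work);

	rtnl_lock();			/* allowed: process context now */
	tcf_exts_destroy(&f->exts);
	rtnl_unlock();
	kfree(f);
}

static void foo_delete_filter(struct rcu_head *head)
{
	struct foo_filter *f = container_of(head, struct foo_filter, rcu);

	/* Still runs in softirq context; defer the real work. */
	INIT_WORK(&f->work, foo_delete_filter_work);
	tcf_queue_work(&f->work);
}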

Patch 1 introduces the infrastructure, patches 2~12 move each
tc filter to the new tc filter workqueue, patch 13 adds
an assertion to catch potential bugs like this, patch 14
closes another RCU callback race, and patches 15 and 16 add
new test cases.
====================

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 8c83c885 31c2611b
include/net/pkt_cls.h: +3 −0
@@ -2,6 +2,7 @@
#define __NET_PKT_CLS_H

#include <linux/pkt_cls.h>
#include <linux/workqueue.h>
#include <net/sch_generic.h>
#include <net/act_api.h>

@@ -17,6 +18,8 @@ struct tcf_walker {
int register_tcf_proto_ops(struct tcf_proto_ops *ops);
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);

bool tcf_queue_work(struct work_struct *work);

#ifdef CONFIG_NET_CLS
struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
				bool create);
include/net/sch_generic.h: +2 −0
@@ -10,6 +10,7 @@
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>

@@ -271,6 +272,7 @@ struct tcf_chain {

struct tcf_block {
	struct list_head chain_list;
	struct work_struct work;
};

static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
net/sched/act_sample.c: +1 −0
@@ -264,6 +264,7 @@ static int __init sample_init_module(void)

static void __exit sample_cleanup_module(void)
{
	rcu_barrier();
	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
}
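
The rcu_barrier() added here is the usual rule for any module that passes
its own functions to call_rcu(): the exit path must wait for all pending
callbacks to run, otherwise a callback could fire after the module text
has been freed. A minimal sketch of that rule, with illustrative names
only:

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));	/* module-local code */
}

static int __init foo_init(void)
{
	struct foo *obj = kmalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return -ENOMEM;
	call_rcu(&obj->rcu, foo_free_rcu);	/* queued, runs later */
	return 0;
}

static void __exit foo_exit(void)
{
	/* Wait for every pending foo_free_rcu() before the module goes away. */
	rcu_barrier();
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");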

net/sched/cls_api.c: +52 −17
@@ -77,6 +77,8 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops)
}
EXPORT_SYMBOL(register_tcf_proto_ops);

static struct workqueue_struct *tc_filter_wq;

int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
	struct tcf_proto_ops *t;
@@ -86,6 +88,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
	 * tcf_proto_ops's destroy() handler.
	 */
	rcu_barrier();
	flush_workqueue(tc_filter_wq);

	write_lock(&cls_mod_lock);
	list_for_each_entry(t, &tcf_proto_base, head) {
@@ -100,6 +103,12 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);

bool tcf_queue_work(struct work_struct *work)
{
	return queue_work(tc_filter_wq, work);
}
EXPORT_SYMBOL(tcf_queue_work);

/* Select new prio value from the range, managed by kernel. */

static inline u32 tcf_auto_prio(struct tcf_proto *tp)
@@ -266,23 +275,30 @@ int tcf_block_get(struct tcf_block **p_block,
}
EXPORT_SYMBOL(tcf_block_get);

void tcf_block_put(struct tcf_block *block)
static void tcf_block_put_final(struct work_struct *work)
{
	struct tcf_block *block = container_of(work, struct tcf_block, work);
	struct tcf_chain *chain, *tmp;

	if (!block)
		return;
	/* At this point, all the chains should have refcnt == 1. */
	rtnl_lock();
	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
		tcf_chain_put(chain);
	rtnl_unlock();
	kfree(block);
}

	/* XXX: Standalone actions are not allowed to jump to any chain, and
	 * bound actions should be all removed after flushing. However,
	 * filters are destroyed in RCU callbacks, we have to hold the chains
	 * first, otherwise we would always race with RCU callbacks on this list
	 * without proper locking.
/* XXX: Standalone actions are not allowed to jump to any chain, and bound
 * actions should be all removed after flushing. However, filters are destroyed
 * in RCU callbacks, we have to hold the chains first, otherwise we would
 * always race with RCU callbacks on this list without proper locking.
 */
static void tcf_block_put_deferred(struct work_struct *work)
{
	struct tcf_block *block = container_of(work, struct tcf_block, work);
	struct tcf_chain *chain;

	/* Wait for existing RCU callbacks to cool down. */
	rcu_barrier();

	rtnl_lock();
	/* Hold a refcnt for all chains, except 0, in case they are gone. */
	list_for_each_entry(chain, &block->chain_list, list)
		if (chain->index)
@@ -292,13 +308,27 @@ void tcf_block_put(struct tcf_block *block)
	list_for_each_entry(chain, &block->chain_list, list)
		tcf_chain_flush(chain);

	/* Wait for RCU callbacks to release the reference count. */
	INIT_WORK(&block->work, tcf_block_put_final);
	/* Wait for RCU callbacks to release the reference count and make
	 * sure their works have been queued before this.
	 */
	rcu_barrier();
	tcf_queue_work(&block->work);
	rtnl_unlock();
}

	/* At this point, all the chains should have refcnt == 1. */
	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
		tcf_chain_put(chain);
	kfree(block);
void tcf_block_put(struct tcf_block *block)
{
	if (!block)
		return;

	INIT_WORK(&block->work, tcf_block_put_deferred);
	/* Wait for existing RCU callbacks to cool down, make sure their works
	 * have been queued before this. We can not flush pending works here
	 * because we are holding the RTNL lock.
	 */
	rcu_barrier();
	tcf_queue_work(&block->work);
}
EXPORT_SYMBOL(tcf_block_put);

@@ -879,6 +909,7 @@ void tcf_exts_destroy(struct tcf_exts *exts)
#ifdef CONFIG_NET_CLS_ACT
	LIST_HEAD(actions);

	ASSERT_RTNL();
	tcf_exts_to_list(exts, &actions);
	tcf_action_destroy(&actions, TCA_ACT_UNBIND);
	kfree(exts->actions);
@@ -1030,6 +1061,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev);

static int __init tc_filter_init(void)
{
	tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
	if (!tc_filter_wq)
		return -ENOMEM;

	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
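
tc_filter_wq is created with alloc_ordered_workqueue(), so it executes at
most one work item at a time, in queueing order. Combined with the
rcu_barrier() calls in tcf_block_put() above, this ordering is what
guarantees that the block's final free runs only after every
filter-destruction work already queued from RCU callbacks. A minimal,
stand-alone illustration of the ordering property (illustrative module,
not part of this series):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *wq;
static struct work_struct first, second;

static void run_first(struct work_struct *work)
{
	pr_info("runs first\n");
}

static void run_second(struct work_struct *work)
{
	pr_info("runs second, only after the first work has finished\n");
}

static int __init order_init(void)
{
	wq = alloc_ordered_workqueue("order_example", 0);
	if (!wq)
		return -ENOMEM;

	INIT_WORK(&first, run_first);
	INIT_WORK(&second, run_second);

	/* An ordered workqueue runs its works strictly in queueing order,
	 * no matter how many CPUs are available.
	 */
	queue_work(wq, &first);
	queue_work(wq, &second);
	return 0;
}

static void __exit order_exit(void)
{
	destroy_workqueue(wq);	/* drains pending works before freeing */
}

module_init(order_init);
module_exit(order_exit);
MODULE_LICENSE("GPL");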
net/sched/cls_basic.c: +17 −3
@@ -34,8 +34,11 @@ struct basic_filter {
	struct tcf_result	res;
	struct tcf_proto	*tp;
	struct list_head	link;
	union {
		struct work_struct	work;
		struct rcu_head		rcu;
	};
};

static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			  struct tcf_result *res)
@@ -82,15 +85,26 @@ static int basic_init(struct tcf_proto *tp)
	return 0;
}

static void basic_delete_filter(struct rcu_head *head)
static void basic_delete_filter_work(struct work_struct *work)
{
	struct basic_filter *f = container_of(head, struct basic_filter, rcu);
	struct basic_filter *f = container_of(work, struct basic_filter, work);

	rtnl_lock();
	tcf_exts_destroy(&f->exts);
	tcf_em_tree_destroy(&f->ematches);
	rtnl_unlock();

	kfree(f);
}

static void basic_delete_filter(struct rcu_head *head)
{
	struct basic_filter *f = container_of(head, struct basic_filter, rcu);

	INIT_WORK(&f->work, basic_delete_filter_work);
	tcf_queue_work(&f->work);
}

static void basic_destroy(struct tcf_proto *tp)
{
	struct basic_head *head = rtnl_dereference(tp->root);