
Commit 85670cc1 authored by Patrick McHardy, committed by David S. Miller

[NET_SCHED]: Fix fallout from dev->qdisc RCU change



The move of qdisc destruction to an RCU callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.

The two assumptions were:

- since changes only happen in process context, read_lock doesn't need
  bottom half protection. Now invalid since destruction of inner qdiscs,
  classifiers, actions and estimators happens in the RCU callback unless
  they're manually deleted, resulting in deadlocks when read_lock in
  process context is interrupted by write_lock_bh in bottom half context
  (see the sketch after this list).

- since changes only happen under the RTNL, no additional locking is
  necessary for data not used during packet processing (e.g. u32_list).
  Again, since destruction now happens in the RCU callback, this assumption
  is no longer valid, causing races while using this data, which can
  result in corruption or use-after-free.
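
To make the first failure mode concrete, here is a minimal sketch of the
deadlock; this is hypothetical code, not part of the patch, and
example_tree_lock and both functions are stand-ins for qdisc_tree_lock
and its users:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_tree_lock);	/* stand-in for qdisc_tree_lock */

static void reader_in_process_context(void)
{
	read_lock(&example_tree_lock);		/* bottom halves left enabled */
	/*
	 * If a softirq fires on this CPU at this point and enters
	 * writer_in_bh_context(), the writer spins on the rwlock while
	 * the interrupted reader can never resume to release it:
	 * the CPU deadlocks.
	 */
	read_unlock(&example_tree_lock);
}

static void writer_in_bh_context(void)		/* e.g. an RCU callback */
{
	write_lock_bh(&example_tree_lock);
	/* ... modify the qdisc tree ... */
	write_unlock_bh(&example_tree_lock);
}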

Instead of "fixing" this by disabling bottom halves everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->queue_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the RCU callback
to make sure dev_queue_xmit doesn't access already freed memory.
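
Condensed from the net/sched/sch_generic.c hunk below (the
CONFIG_NET_ESTIMATOR block is omitted here), the resulting scheme is:
all teardown runs synchronously in process context under the RTNL, and
the RCU callback shrinks to the bare free:

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);

	/* A grace period has elapsed, so no dev_queue_xmit() can still
	 * hold a pointer obtained via rcu_dereference(dev->qdisc).
	 */
	kfree((char *) qdisc - qdisc->padded);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);			/* process context, RTNL held */
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);	/* defer only the kfree */
}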

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 787e0617
net/core/dev.c: +8 −6
@@ -1480,15 +1480,17 @@ gso:
	if (q->enqueue) {
		/* Grab device queue */
		spin_lock(&dev->queue_lock);
-
-		rc = q->enqueue(skb, q);
-
-		qdisc_run(dev);
+		q = dev->qdisc;
+		if (q->enqueue) {
+			rc = q->enqueue(skb, q);
+			qdisc_run(dev);
+			spin_unlock(&dev->queue_lock);

+			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+			goto out;
+		}
		spin_unlock(&dev->queue_lock);
-		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...
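
Note how the hunk above re-reads dev->qdisc after taking dev->queue_lock:
the earlier lockless read (done via rcu_dereference() before this hunk)
only decides whether the device has a queue at all, while the qdisc
actually enqueued to is the one fetched under the lock, which serializes
against the now-synchronous destruction path.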
net/sched/cls_api.c: +2 −2
@@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return skb->len;

-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
	if (!tcm->tcm_parent)
		q = dev->qdisc_sleeping;
	else
@@ -458,7 +458,7 @@ errout:
	if (cl)
		cops->put(q, cl);
out:
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
	dev_put(dev);
	return skb->len;
}
net/sched/sch_api.c: +8 −8
@@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle) {
-			read_unlock_bh(&qdisc_tree_lock);
+			read_unlock(&qdisc_tree_lock);
			return q;
		}
	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
	return NULL;
}

@@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
-		read_lock_bh(&qdisc_tree_lock);
+		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
@@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
-				read_unlock_bh(&qdisc_tree_lock);
+				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
-		read_unlock_bh(&qdisc_tree_lock);
+		read_unlock(&qdisc_tree_lock);
	}

done:
@@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
	s_t = cb->args[0];
	t = 0;

-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
@@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
			break;
		t++;
	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

net/sched/sch_generic.c: +21 −45
@@ -45,11 +45,10 @@
   The idea is the following:
   - enqueue, dequeue are serialized via top level device
     spinlock dev->queue_lock.
-   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
+   - tree walking is protected by read_lock(qdisc_tree_lock)
     and this lock is used only in process context.
-   - updates to tree are made under rtnl semaphore or
-     from softirq context (__qdisc_destroy rcu-callback)
-     hence this lock needs local bh disabling.
+   - updates to tree are made only under rtnl semaphore,
+     hence this lock may be made without local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
-	write_lock_bh(&qdisc_tree_lock);
+	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
-	write_unlock_bh(&qdisc_tree_lock);
+	write_unlock(&qdisc_tree_lock);
}

/* 
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc)
static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
-	struct Qdisc_ops  *ops = qdisc->ops;
-
-#ifdef CONFIG_NET_ESTIMATOR
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
-#endif
-	write_lock(&qdisc_tree_lock);
-	if (ops->reset)
-		ops->reset(qdisc);
-	if (ops->destroy)
-		ops->destroy(qdisc);
-	write_unlock(&qdisc_tree_lock);
-	module_put(ops->owner);
-
-	dev_put(qdisc->dev);
	kfree((char *) qdisc - qdisc->padded);
}

@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head)

void qdisc_destroy(struct Qdisc *qdisc)
{
-	struct list_head cql = LIST_HEAD_INIT(cql);
-	struct Qdisc *cq, *q, *n;
+	struct Qdisc_ops  *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

-	if (!list_empty(&qdisc->list)) {
-		if (qdisc->ops->cl_ops == NULL)
-			list_del(&qdisc->list);
-		else
-			list_move(&qdisc->list, &cql);
-	}
-
-	/* unlink inner qdiscs from dev->qdisc_list immediately */
-	list_for_each_entry(cq, &cql, list)
-		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
-			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
-				if (q->ops->cl_ops == NULL)
-					list_del_init(&q->list);
-				else
-					list_move_tail(&q->list, &cql);
-			}
-	list_for_each_entry_safe(cq, n, &cql, list)
-		list_del_init(&cq->list);
+	list_del(&qdisc->list);
+#ifdef CONFIG_NET_ESTIMATOR
+	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+#endif
+	if (ops->reset)
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);

+	module_put(ops->owner);
+	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}
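
With destruction in this shape, qdisc_destroy() may only be called in
process context with the RTNL held; concurrent dev_queue_xmit() callers
may still be traversing the old qdisc under RCU, but they only need its
memory to stay valid, which the deferred __qdisc_destroy() guarantees.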

@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev)
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
-			write_lock_bh(&qdisc_tree_lock);
+			write_lock(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
-			write_unlock_bh(&qdisc_tree_lock);
+			write_unlock(&qdisc_tree_lock);
		} else {
			qdisc =  &noqueue_qdisc;
		}
-		write_lock_bh(&qdisc_tree_lock);
+		write_lock(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
-		write_unlock_bh(&qdisc_tree_lock);
+		write_unlock(&qdisc_tree_lock);
	}

	if (!netif_carrier_ok(dev))