
Commit aa1039e7 authored by Eric Dumazet, committed by David S. Miller

inetpeer: RCU conversion



inetpeer currently uses an AVL tree protected by an rwlock.

It's possible to make most lookups use RCU:

1) Add a struct rcu_head to struct inet_peer

2) Add a lookup_rcu_bh() helper to perform a lockless, opportunistic
lookup. This is a normal function, not a macro like lookup().

3) Add a limit to the number of links followed by lookup_rcu_bh(). This is
needed in case we fall into a loop.

4) Add an smp_wmb() in link_to_pool() right before the node insert, so
lockless readers only see fully initialised nodes.

5) Make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
the last reference to an inet_peer, since lockless readers could increase
the refcount even while we hold peers.lock.

6) Delay freeing of struct inet_peer until after an RCU grace period, so
that lookup_rcu_bh() cannot dereference freed memory.

7) inet_getpeer() first attempts a lockless lookup.
   Note this lookup can fail even if the target is in the AVL tree, because
a concurrent writer can leave the tree in a transiently inconsistent form.
   If this attempt fails, the lock is taken and a regular lookup is
performed again. (A minimal sketch of this reader/writer pattern follows
this list.)

8) Convert peers.lock from an rwlock to a spinlock.

9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because the
rcu_head adds 16 bytes on 64-bit arches, doubling the effective object size
(64 -> 128 bytes).
A future patch could probably revert this part by placing the rcu field in
a union sharing space with rid, ip_id_count, tcp_ts & tcp_ts_stamp, since
these fields are only manipulated while refcnt > 0 (see the union sketch
after this list).
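
Below is a minimal, self-contained sketch of the reader/writer pattern that
points 2), 5) and 6) describe. It uses a plain singly linked structure and
hypothetical names (struct node, pool_lock, lookup_lockless, try_unlink)
rather than the real inetpeer AVL code, so treat it only as an illustration
under those assumptions; the diff below is the authoritative implementation.

	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	/* Hypothetical node: a singly linked list stands in for the AVL tree. */
	struct node {
		__be32			key;
		atomic_t		refcnt;		/* 0 means "being unlinked" */
		struct node		*next;
		struct rcu_head		rcu;
	};

	static struct node *head;
	static DEFINE_SPINLOCK(pool_lock);

	/* Reader side, called under rcu_read_lock_bh(): follow RCU-protected
	 * pointers and take a reference only if the node is still live,
	 * mirroring what lookup_rcu_bh() does for the AVL tree.
	 */
	static struct node *lookup_lockless(__be32 key)
	{
		struct node *n;

		for (n = rcu_dereference_bh(head); n; n = rcu_dereference_bh(n->next)) {
			if (n->key == key)
				return atomic_inc_not_zero(&n->refcnt) ? n : NULL;
		}
		return NULL;
	}

	static void node_free_rcu(struct rcu_head *rhead)
	{
		kfree(container_of(rhead, struct node, rcu));
	}

	/* Writer side, mirroring unlink_from_pool(): under the pool spinlock,
	 * atomically take the last reference (1 -> 0) so that a concurrent
	 * lockless reader cannot revive the node, then free it only after an
	 * RCU grace period.
	 */
	static void try_unlink(struct node *n)
	{
		spin_lock_bh(&pool_lock);
		if (atomic_cmpxchg(&n->refcnt, 1, 0) == 1) {
			/* ... unlink n from the data structure here ... */
			spin_unlock_bh(&pool_lock);
			call_rcu_bh(&n->rcu, node_free_rcu);
			return;
		}
		spin_unlock_bh(&pool_lock);
	}

Callers of lookup_lockless() would bracket it with rcu_read_lock_bh() /
rcu_read_unlock_bh(), just as inet_getpeer() does around lookup_rcu_bh() in
the diff. The insertion side (point 4) is omitted here; the real patch
publishes a fully initialised node with an explicit smp_wmb() before
linking it.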
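Point 9)'s closing note is only a suggestion for a later patch, not part of
this commit. A hypothetical layout that would reclaim the 16 bytes on
64-bit (where struct rcu_head is two pointers, the same size as the four
32-bit fields it would overlap) might look like the sketch below; the field
names match include/net/inetpeer.h, everything else is an assumption.

	#include <linux/types.h>
	#include <linux/rcupdate.h>

	/* HYPOTHETICAL future layout suggested by point 9), not in this commit. */
	struct inet_peer {
		/* ... avl_left, avl_right, v4daddr, avl_height and the unused
		 *     list as in the existing structure ...
		 */
		atomic_t		refcnt;
		union {
			struct {
				atomic_t	rid;
				atomic_t	ip_id_count;	/* IP ID for the next packet */
				__u32		tcp_ts;
				__u32		tcp_ts_stamp;
			};
			/* Only used once refcnt has dropped to 0 and the node
			 * is unlinked, so it can safely overlay the fields
			 * above, which are only touched while refcnt > 0.
			 */
			struct rcu_head		rcu;
		};
	};

With such a union the object would fit back into 64 bytes, and
SLAB_HWCACHE_ALIGN could presumably be restored.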

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 7b34a464
include/net/inetpeer.h  +1 −0
@@ -26,6 +26,7 @@ struct inet_peer {
	atomic_t		ip_id_count;	/* IP ID for the next packet */
	__u32			tcp_ts;
	__u32			tcp_ts_stamp;
	struct rcu_head		rcu;
};

void			inet_initpeers(void) __init;
net/ipv4/inetpeer.c  +95 −69
@@ -51,8 +51,8 @@
 *  lookups performed with disabled BHs.
 *
 *  Serialisation issues.
 *  1.  Nodes may appear in the tree only with the pool write lock held.
 *  2.  Nodes may disappear from the tree only with the pool write lock held
 *  1.  Nodes may appear in the tree only with the pool lock held.
 *  2.  Nodes may disappear from the tree only with the pool lock held
 *      AND reference count being 0.
 *  3.  Nodes appears and disappears from unused node list only under
 *      "inet_peer_unused_lock".
@@ -80,11 +80,11 @@ static const struct inet_peer peer_fake_node = {

static struct {
	struct inet_peer *root;
	rwlock_t	lock;
	spinlock_t	lock;
	int		total;
} peers = {
	.root		= peer_avl_empty,
	.lock		= __RW_LOCK_UNLOCKED(peers.lock),
	.lock		= __SPIN_LOCK_UNLOCKED(peers.lock),
	.total		= 0,
};
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
@@ -129,7 +129,7 @@ void __init inet_initpeers(void)

	peer_cachep = kmem_cache_create("inet_peer_cache",
			sizeof(struct inet_peer),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
			0, SLAB_PANIC,
			NULL);

	/* All the timers, started at system startup tend
@@ -153,16 +153,13 @@ static void unlink_from_unused(struct inet_peer *p)

/*
 * Called with local BH disabled and the pool lock held.
 * _stack is known to be NULL or not at compile time,
 * so compiler will optimize the if (_stack) tests.
 */
#define lookup(_daddr, _stack) 					\
({								\
	struct inet_peer *u, **v;				\
	if (_stack != NULL) {					\
								\
	stackptr = _stack;					\
	*stackptr++ = &peers.root;				\
	}							\
	for (u = peers.root; u != peer_avl_empty; ) {		\
		if (_daddr == u->v4daddr)			\
			break;					\
@@ -170,14 +167,41 @@ static void unlink_from_unused(struct inet_peer *p)
			v = &u->avl_left;			\
		else						\
			v = &u->avl_right;			\
		if (_stack != NULL)				\
		*stackptr++ = v;				\
		u = *v;						\
	}							\
	u;							\
})

/* Called with local BH disabled and the pool write lock held. */
/*
 * Called with rcu_read_lock_bh()
 * Because we hold no lock against a writer, its quite possible we fall
 * in an endless loop.
 * But every pointer we follow is guaranteed to be valid thanks to RCU.
 * We exit from this function if number of links exceeds PEER_MAXDEPTH
 */
static struct inet_peer *lookup_rcu_bh(__be32 daddr)
{
	struct inet_peer *u = rcu_dereference_bh(peers.root);
	int count = 0;

	while (u != peer_avl_empty) {
		if (daddr == u->v4daddr) {
			if (unlikely(!atomic_inc_not_zero(&u->refcnt)))
				u = NULL;
			return u;
		}
		if ((__force __u32)daddr < (__force __u32)u->v4daddr)
			u = rcu_dereference_bh(u->avl_left);
		else
			u = rcu_dereference_bh(u->avl_right);
		if (unlikely(++count == PEER_MAXDEPTH))
			break;
	}
	return NULL;
}

/* Called with local BH disabled and the pool lock held. */
#define lookup_rightempty(start)				\
({								\
	struct inet_peer *u, **v;				\
@@ -191,9 +215,10 @@ static void unlink_from_unused(struct inet_peer *p)
	u;							\
})

/* Called with local BH disabled and the pool write lock held.
/* Called with local BH disabled and the pool lock held.
 * Variable names are the proof of operation correctness.
 * Look into mm/map_avl.c for more detail description of the ideas.  */
 * Look into mm/map_avl.c for more detail description of the ideas.
 */
static void peer_avl_rebalance(struct inet_peer **stack[],
		struct inet_peer ***stackend)
{
@@ -269,16 +294,22 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
	}
}

/* Called with local BH disabled and the pool write lock held. */
/* Called with local BH disabled and the pool lock held. */
#define link_to_pool(n)						\
do {								\
	n->avl_height = 1;					\
	n->avl_left = peer_avl_empty;				\
	n->avl_right = peer_avl_empty;				\
	smp_wmb(); /* lockless readers can catch us now */	\
	**--stackptr = n;					\
	peer_avl_rebalance(stack, stackptr);			\
} while (0)

static void inetpeer_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}

/* May be called with local BH enabled. */
static void unlink_from_pool(struct inet_peer *p)
{
@@ -286,13 +317,13 @@ static void unlink_from_pool(struct inet_peer *p)

	do_free = 0;

	write_lock_bh(&peers.lock);
	spin_lock_bh(&peers.lock);
	/* Check the reference counter.  It was artificially incremented by 1
	 * in cleanup() function to prevent sudden disappearing.  If the
	 * reference count is still 1 then the node is referenced only as `p'
	 * here and from the pool.  So under the exclusive pool lock it's safe
	 * to remove the node and free it later. */
	if (atomic_read(&p->refcnt) == 1) {
	 * in cleanup() function to prevent sudden disappearing.  If we can
	 * atomically (because of lockless readers) take this last reference,
	 * it's safe to remove the node and free it later.
	 */
	if (atomic_cmpxchg(&p->refcnt, 1, 0) == 1) {
		struct inet_peer **stack[PEER_MAXDEPTH];
		struct inet_peer ***stackptr, ***delp;
		if (lookup(p->v4daddr, stack) != p)
@@ -321,17 +352,18 @@ static void unlink_from_pool(struct inet_peer *p)
		peers.total--;
		do_free = 1;
	}
	write_unlock_bh(&peers.lock);
	spin_unlock_bh(&peers.lock);

	if (do_free)
		kmem_cache_free(peer_cachep, p);
		call_rcu_bh(&p->rcu, inetpeer_free_rcu);
	else
		/* The node is used again.  Decrease the reference counter
		 * back.  The loop "cleanup -> unlink_from_unused
		 *   -> unlink_from_pool -> putpeer -> link_to_unused
		 *   -> cleanup (for the same node)"
		 * doesn't really exist because the entry will have a
		 * recent deletion time and will not be cleaned again soon. */
		 * recent deletion time and will not be cleaned again soon.
		 */
		inet_putpeer(p);
}

@@ -375,62 +407,56 @@ static int cleanup_once(unsigned long ttl)
/* Called with or without local BH being disabled. */
struct inet_peer *inet_getpeer(__be32 daddr, int create)
{
	struct inet_peer *p, *n;
	struct inet_peer *p;
	struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;

	/* Look up for the address quickly. */
	read_lock_bh(&peers.lock);
	p = lookup(daddr, NULL);
	if (p != peer_avl_empty)
		atomic_inc(&p->refcnt);
	read_unlock_bh(&peers.lock);
	/* Look up for the address quickly, lockless.
	 * Because of a concurrent writer, we might not find an existing entry.
	 */
	rcu_read_lock_bh();
	p = lookup_rcu_bh(daddr);
	rcu_read_unlock_bh();

	if (p) {
		/* The existing node has been found.
		 * Remove the entry from unused list if it was there.
		 */
		unlink_from_unused(p);
		return p;
	}

	/* retry an exact lookup, taking the lock before.
	 * At least, nodes should be hot in our cache.
	 */
	spin_lock_bh(&peers.lock);
	p = lookup(daddr, stack);
	if (p != peer_avl_empty) {
		/* The existing node has been found. */
		atomic_inc(&p->refcnt);
		spin_unlock_bh(&peers.lock);
		/* Remove the entry from unused list if it was there. */
		unlink_from_unused(p);
		return p;
	}
	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
	if (p) {
		p->v4daddr = daddr;
		atomic_set(&p->refcnt, 1);
		atomic_set(&p->rid, 0);
		atomic_set(&p->ip_id_count, secure_ip_id(daddr));
		p->tcp_ts_stamp = 0;
		INIT_LIST_HEAD(&p->unused);

	if (!create)
		return NULL;

	/* Allocate the space outside the locked region. */
	n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
	if (n == NULL)
		return NULL;
	n->v4daddr = daddr;
	atomic_set(&n->refcnt, 1);
	atomic_set(&n->rid, 0);
	atomic_set(&n->ip_id_count, secure_ip_id(daddr));
	n->tcp_ts_stamp = 0;

	write_lock_bh(&peers.lock);
	/* Check if an entry has suddenly appeared. */
	p = lookup(daddr, stack);
	if (p != peer_avl_empty)
		goto out_free;

		/* Link the node. */
	link_to_pool(n);
	INIT_LIST_HEAD(&n->unused);
		link_to_pool(p);
		peers.total++;
	write_unlock_bh(&peers.lock);
	}
	spin_unlock_bh(&peers.lock);

	if (peers.total >= inet_peer_threshold)
		/* Remove one less-recently-used entry. */
		cleanup_once(0);

	return n;

out_free:
	/* The appropriate node is already in the pool. */
	atomic_inc(&p->refcnt);
	write_unlock_bh(&peers.lock);
	/* Remove the entry from unused list if it was there. */
	unlink_from_unused(p);
	/* Free preallocated the preallocated node. */
	kmem_cache_free(peer_cachep, n);
	return p;
}