
Commit 41e8b206 authored by David S. Miller

Merge branch 'rhashtable-next'



Thomas Graf says:

====================
rhashtable fixes

This series fixes all remaining known issues that have been reported
against rhashtable, in particular the race condition reported by
Ying Xue.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 2ca292d9 cf52d52f
+167 −138
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Based on the following paper:
@@ -34,12 +34,17 @@
enum {
	RHT_LOCK_NORMAL,
	RHT_LOCK_NESTED,
	RHT_LOCK_NESTED2,
};

/* The bucket lock is selected based on the hash and protects mutations
 * on a group of hash buckets.
 *
 * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
 * a single lock always covers both buckets which may both contain
 * entries linking to the same bucket of the old table during resizing.
 * This simplifies the locking, as locking the bucket in both tables
 * during a resize always guarantees protection.
 *
 * IMPORTANT: When holding the bucket lock of both the old and new table
 * during expansions and shrinking, the old bucket lock must always be
 * acquired first.
@@ -49,26 +54,6 @@ static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
	return &tbl->locks[hash & tbl->locks_mask];
}
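
The size/2 rule above is easy to check with concrete numbers. Below is a minimal userspace sketch (illustrative only, not kernel code; the size and nlocks values are made up) showing that bucket h and its resize partner h + size/2 always map to the same lock once no more than size/2 locks are allocated:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int size = 8;		/* table buckets, power of two */
	unsigned int nlocks = size / 2;	/* the documented maximum */
	unsigned int locks_mask = nlocks - 1;

	for (unsigned int h = 0; h < size / 2; h++) {
		/* Buckets h and h + size/2 fold into one bucket when the
		 * table is halved; with nlocks <= size/2 (a power of two),
		 * both indices mask down to the same lock.
		 */
		unsigned int lock_a = h & locks_mask;
		unsigned int lock_b = (h + size / 2) & locks_mask;

		assert(lock_a == lock_b);
		printf("buckets %u and %u -> lock %u\n",
		       h, h + size / 2, lock_a);
	}
	return 0;
}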

#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
#define ASSERT_BUCKET_LOCK(TBL, HASH) \
	BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))

#ifdef CONFIG_PROVE_LOCKING
int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
	spinlock_t *lock = bucket_lock(tbl, hash);

	return (debug_locks) ? lockdep_is_held(lock) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#endif

static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
{
	return (void *) he - ht->p.head_offset;
@@ -94,13 +79,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)

static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
{
	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
	u32 hash;

	hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
	hash >>= HASH_RESERVED_SPACE;

	return rht_bucket_index(tbl, hash);
	return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE;
}
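
key_hashfn() now returns the raw, unmasked hash, so a caller that walks both tables during a resize can hash the key once and mask it per table with rht_bucket_index(). An illustrative fragment (the old_tbl/new_tbl variables are assumed here, not part of this patch):

	u32 hash = key_hashfn(ht, key, ht->p.key_len);
	u32 old_idx = rht_bucket_index(old_tbl, hash);	/* old table */
	u32 new_idx = rht_bucket_index(new_tbl, hash);	/* future table */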

static u32 head_hashfn(const struct rhashtable *ht,
@@ -110,6 +89,77 @@ static u32 head_hashfn(const struct rhashtable *ht,
	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
}

#ifdef CONFIG_PROVE_LOCKING
static void debug_dump_buckets(const struct rhashtable *ht,
			       const struct bucket_table *tbl)
{
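	/* Walk every bucket and print, for each entry, the hash it maps to
	 * and the bucket lock that should be covering it.
	 */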
	struct rhash_head *he;
	unsigned int i, hash;

	for (i = 0; i < tbl->size; i++) {
		pr_warn(" [Bucket %d] ", i);
		rht_for_each_rcu(he, tbl, i) {
			hash = head_hashfn(ht, tbl, he);
			pr_cont("[hash = %#x, lock = %p] ",
				hash, bucket_lock(tbl, hash));
		}
		pr_cont("\n");
	}
}

static void debug_dump_table(struct rhashtable *ht,
			     const struct bucket_table *tbl,
			     unsigned int hash)
{
	struct bucket_table *old_tbl, *future_tbl;

	pr_emerg("BUG: lock for hash %#x in table %p not held\n",
		 hash, tbl);

	rcu_read_lock();
	future_tbl = rht_dereference_rcu(ht->future_tbl, ht);
	old_tbl = rht_dereference_rcu(ht->tbl, ht);
	if (future_tbl != old_tbl) {
		pr_warn("Future table %p (size: %zd)\n",
			future_tbl, future_tbl->size);
		debug_dump_buckets(ht, future_tbl);
	}

	pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size);
	debug_dump_buckets(ht, old_tbl);

	rcu_read_unlock();
}

#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)				\
	do {								\
		if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) {	\
			debug_dump_table(HT, TBL, HASH);		\
			BUG();						\
		}							\
	} while (0)

int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
	spinlock_t *lock = bucket_lock(tbl, hash);

	return (debug_locks) ? lockdep_is_held(lock) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
#define ASSERT_RHT_MUTEX(HT)
#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)
#endif


static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
{
	struct rhash_head __rcu **pprev;
@@ -134,8 +184,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);

	/* Never allocate more than one lock per bucket */
	size = min_t(unsigned int, size, tbl->size);
	/* Never allocate more than 0.5 locks per bucket */
	size = min_t(unsigned int, size, tbl->size >> 1);
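	/* E.g. with 8 possible CPUs and locks_mul == 32, size becomes
	 * roundup_pow_of_two(8 * 32) == 256, and a 64-bucket table is then
	 * capped at 64 >> 1 == 32 locks.
	 */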

	if (sizeof(spinlock_t) != 0) {
#ifdef CONFIG_NUMA
@@ -217,25 +267,48 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
}
EXPORT_SYMBOL_GPL(rht_shrink_below_30);

static void hashtable_chain_unzip(const struct rhashtable *ht,
static void lock_buckets(struct bucket_table *new_tbl,
			 struct bucket_table *old_tbl, unsigned int hash)
	__acquires(old_bucket_lock)
{
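	/* Per the IMPORTANT rule above: the old table's bucket lock is
	 * always acquired first; the new table's lock, when the tables
	 * differ, nests under it.
	 */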
	spin_lock_bh(bucket_lock(old_tbl, hash));
	if (new_tbl != old_tbl)
		spin_lock_bh_nested(bucket_lock(new_tbl, hash),
				    RHT_LOCK_NESTED);
}

static void unlock_buckets(struct bucket_table *new_tbl,
			   struct bucket_table *old_tbl, unsigned int hash)
	__releases(old_bucket_lock)
{
	if (new_tbl != old_tbl)
		spin_unlock_bh(bucket_lock(new_tbl, hash));
	spin_unlock_bh(bucket_lock(old_tbl, hash));
}

/**
 * Unlink entries in the old bucket which hash to a different bucket in
 * the new table.
 *
 * Returns true if more work remains to be done on the bucket.
 */
static bool hashtable_chain_unzip(struct rhashtable *ht,
				  const struct bucket_table *new_tbl,
				  struct bucket_table *old_tbl,
				  size_t old_hash)
{
	struct rhash_head *he, *p, *next;
	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
	unsigned int new_hash, new_hash2;

	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
	ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash);

	/* Old bucket empty, no work needed. */
	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
				   old_hash);
	if (rht_is_a_nulls(p))
		return;
		return false;

	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
	new_bucket_lock = bucket_lock(new_tbl, new_hash);
	new_hash = head_hashfn(ht, new_tbl, p);
	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);

	/* Advance the old bucket pointer one or more times until it
	 * reaches a node that doesn't hash to the same bucket as the
@@ -243,22 +316,14 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
	 */
	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
		new_hash2 = head_hashfn(ht, new_tbl, he);
		ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2);

		if (new_hash != new_hash2)
			break;
		p = he;
	}
	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);

	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);

	/* If we have encountered an entry that maps to a different bucket in
	 * the new table, lock down that bucket as well as we might cut off
	 * the end of the chain.
	 */
	new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
	if (new_bucket_lock != new_bucket_lock2)
		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);

	/* Find the subsequent node which does hash to the same
	 * bucket as node P, or NULL if no such node exists.
	 */
@@ -277,21 +342,18 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
	 */
	rcu_assign_pointer(p->next, next);

	if (new_bucket_lock != new_bucket_lock2)
		spin_unlock_bh(new_bucket_lock2);
	spin_unlock_bh(new_bucket_lock);
	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
				   old_hash);

	return !rht_is_a_nulls(p);
}
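
The end state that repeated unzip passes converge to can be modeled in plain C. A self-contained userspace sketch (the node layout and the two-bucket setup are invented for illustration; the kernel reaches the same result incrementally under RCU rather than in one pass):

#include <stdio.h>

struct node {
	struct node *next;
	unsigned int new_hash;	/* bucket this entry hashes to in the new table */
	char name;
};

int main(void)
{
	/* Interleaved chain a->b->c->d->e with new-table buckets 0,1,0,1,1 */
	struct node e = { NULL, 1, 'e' };
	struct node d = { &e,   1, 'd' };
	struct node c = { &d,   0, 'c' };
	struct node b = { &c,   1, 'b' };
	struct node a = { &b,   0, 'a' };

	struct node *heads[2] = { NULL, NULL };
	struct node **tails[2] = { &heads[0], &heads[1] };

	/* One ordered pass: append each entry to its new bucket's chain.
	 * hashtable_chain_unzip() arrives at the same end state across
	 * several passes, with RCU grace periods in between.
	 */
	for (struct node *n = &a, *next; n; n = next) {
		next = n->next;
		n->next = NULL;
		*tails[n->new_hash] = n;
		tails[n->new_hash] = &n->next;
	}

	for (unsigned int h = 0; h < 2; h++) {
		printf("new bucket %u:", h);
		for (struct node *n = heads[h]; n; n = n->next)
			printf(" %c", n->name);
		printf("\n");
	}
	return 0;
}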

static void link_old_to_new(struct bucket_table *new_tbl,
static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl,
			    unsigned int new_hash, struct rhash_head *entry)
{
	spinlock_t *new_bucket_lock;

	new_bucket_lock = bucket_lock(new_tbl, new_hash);
	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);

	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
	spin_unlock_bh(new_bucket_lock);
}

/**
@@ -314,7 +376,6 @@ int rhashtable_expand(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
	struct rhash_head *he;
	spinlock_t *old_bucket_lock;
	unsigned int new_hash, old_hash;
	bool complete = false;

@@ -344,24 +405,16 @@ int rhashtable_expand(struct rhashtable *ht)
	 */
	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
		old_hash = rht_bucket_index(old_tbl, new_hash);
		old_bucket_lock = bucket_lock(old_tbl, old_hash);

		spin_lock_bh(old_bucket_lock);
		lock_buckets(new_tbl, old_tbl, new_hash);
		rht_for_each(he, old_tbl, old_hash) {
			if (head_hashfn(ht, new_tbl, he) == new_hash) {
				link_old_to_new(new_tbl, new_hash, he);
				link_old_to_new(ht, new_tbl, new_hash, he);
				break;
			}
		}
		spin_unlock_bh(old_bucket_lock);
		unlock_buckets(new_tbl, old_tbl, new_hash);
	}

	/* Publish the new table pointer. Lookups may now traverse
	 * the new table, but they will not benefit from any
	 * additional efficiency until later steps unzip the buckets.
	 */
	rcu_assign_pointer(ht->tbl, new_tbl);

	/* Unzip interleaved hash chains */
	while (!complete && !ht->being_destroyed) {
		/* Wait for readers. All new readers will see the new
@@ -376,21 +429,19 @@ int rhashtable_expand(struct rhashtable *ht)
		 */
		complete = true;
		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
			struct rhash_head *head;

			old_bucket_lock = bucket_lock(old_tbl, old_hash);
			spin_lock_bh(old_bucket_lock);
			lock_buckets(new_tbl, old_tbl, old_hash);

			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
			head = rht_dereference_bucket(old_tbl->buckets[old_hash],
						      old_tbl, old_hash);
			if (!rht_is_a_nulls(head))
			if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
						  old_hash))
				complete = false;

			spin_unlock_bh(old_bucket_lock);
			unlock_buckets(new_tbl, old_tbl, old_hash);
		}
	}

	rcu_assign_pointer(ht->tbl, new_tbl);
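	/* Every chain is now fully unzipped; readers switch to the new
	 * table for good, and the old table can be freed once a grace
	 * period has elapsed.
	 */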
	synchronize_rcu();

	bucket_table_free(old_tbl);
	return 0;
}
@@ -415,7 +466,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
int rhashtable_shrink(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
	unsigned int new_hash;

	ASSERT_RHT_MUTEX(ht);
@@ -433,36 +483,17 @@ int rhashtable_shrink(struct rhashtable *ht)
	 * always divide the size in half when shrinking, each bucket
	 * in the new table maps to exactly two buckets in the old
	 * table.
	 *
	 * As removals can occur concurrently on the old table, we need
	 * to lock down both matching buckets in the old table.
	 */
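	/* E.g. when halving from 8 to 4 buckets, old buckets 1 and 5 both
	 * fold into new bucket 1, since 5 & (4 - 1) == 1.
	 */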
	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
		old_bucket_lock1 = bucket_lock(tbl, new_hash);
		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
		new_bucket_lock = bucket_lock(new_tbl, new_hash);

		spin_lock_bh(old_bucket_lock1);

		/* Depending on the lock per buckets mapping, the bucket in
		 * the lower and upper region may map to the same lock.
		 */
		if (old_bucket_lock1 != old_bucket_lock2) {
			spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
		} else {
			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
		}
		lock_buckets(new_tbl, tbl, new_hash);

		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
				   tbl->buckets[new_hash]);
		ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size);
		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
				   tbl->buckets[new_hash + new_tbl->size]);

		spin_unlock_bh(new_bucket_lock);
		if (old_bucket_lock1 != old_bucket_lock2)
			spin_unlock_bh(old_bucket_lock2);
		spin_unlock_bh(old_bucket_lock1);
		unlock_buckets(new_tbl, tbl, new_hash);
	}

	/* Publish the new, valid hash table */
@@ -524,6 +555,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
	struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash],
							 tbl, hash);

	ASSERT_BUCKET_LOCK(ht, tbl, hash);

	if (rht_is_a_nulls(head))
		INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
	else
@@ -553,19 +586,18 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
 */
void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
{
	struct bucket_table *tbl;
	spinlock_t *lock;
	struct bucket_table *tbl, *old_tbl;
	unsigned hash;

	rcu_read_lock();

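	/* Inserts always target the future table; lock the matching bucket
	 * in both tables so a concurrent unzip of the old chain cannot race
	 * with the insertion.
	 */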
	tbl = rht_dereference_rcu(ht->future_tbl, ht);
	old_tbl = rht_dereference_rcu(ht->tbl, ht);
	hash = head_hashfn(ht, tbl, obj);
	lock = bucket_lock(tbl, hash);

	spin_lock_bh(lock);
	lock_buckets(tbl, old_tbl, hash);
	__rhashtable_insert(ht, obj, tbl, hash);
	spin_unlock_bh(lock);
	unlock_buckets(tbl, old_tbl, hash);

	rcu_read_unlock();
}
@@ -588,21 +620,20 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
 */
bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
{
	struct bucket_table *tbl;
	struct bucket_table *tbl, *new_tbl, *old_tbl;
	struct rhash_head __rcu **pprev;
	struct rhash_head *he;
	spinlock_t *lock;
	unsigned int hash;
	struct rhash_head *he, *he2;
	unsigned int hash, new_hash;
	bool ret = false;

	rcu_read_lock();
	tbl = rht_dereference_rcu(ht->tbl, ht);
	hash = head_hashfn(ht, tbl, obj);

	lock = bucket_lock(tbl, hash);
	spin_lock_bh(lock);
	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));

	lock_buckets(new_tbl, old_tbl, new_hash);
restart:
	hash = rht_bucket_index(tbl, new_hash);
	pprev = &tbl->buckets[hash];
	rht_for_each(he, tbl, hash) {
		if (he != obj) {
@@ -610,8 +641,22 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
			continue;
		}

		ASSERT_BUCKET_LOCK(ht, tbl, hash);

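		/* During a resize, obj->next may already point into a chain
		 * owned by a different new-table bucket. Relink *pprev to
		 * the next entry that still hashes to this bucket, or
		 * nulls-terminate, so no cross-bucket reference is left
		 * behind.
		 */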
		if (unlikely(new_tbl != tbl)) {
			rht_for_each_continue(he2, he->next, tbl, hash) {
				if (head_hashfn(ht, tbl, he2) == hash) {
					rcu_assign_pointer(*pprev, he2);
					goto found;
				}
			}

			INIT_RHT_NULLS_HEAD(*pprev, ht, hash);
		} else {
			rcu_assign_pointer(*pprev, obj->next);
		}

found:
		ret = true;
		break;
	}
@@ -621,18 +666,12 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
	 * resizing. Thus traversing both is fine and the added cost is
	 * very rare.
	 */
	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
		spin_unlock_bh(lock);

		tbl = rht_dereference_rcu(ht->future_tbl, ht);
		hash = head_hashfn(ht, tbl, obj);

		lock = bucket_lock(tbl, hash);
		spin_lock_bh(lock);
	if (tbl != new_tbl) {
		tbl = new_tbl;
		goto restart;
	}

	spin_unlock_bh(lock);
	unlock_buckets(new_tbl, old_tbl, new_hash);

	if (ret) {
		atomic_dec(&ht->nelems);
@@ -788,24 +827,17 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
				      void *arg)
{
	struct bucket_table *new_tbl, *old_tbl;
	spinlock_t *new_bucket_lock, *old_bucket_lock;
	u32 new_hash, old_hash;
	u32 new_hash;
	bool success = true;

	BUG_ON(!ht->p.key_len);

	rcu_read_lock();

	old_tbl = rht_dereference_rcu(ht->tbl, ht);
	old_hash = head_hashfn(ht, old_tbl, obj);
	old_bucket_lock = bucket_lock(old_tbl, old_hash);
	spin_lock_bh(old_bucket_lock);

	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
	new_hash = head_hashfn(ht, new_tbl, obj);
	new_bucket_lock = bucket_lock(new_tbl, new_hash);
	if (unlikely(old_tbl != new_tbl))
		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);

	lock_buckets(new_tbl, old_tbl, new_hash);

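	/* With both buckets locked, the lookup below cannot race with a
	 * concurrent insert of the same key; the insert happens only if
	 * the key was absent.
	 */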
	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
				      compare, arg)) {
@@ -816,10 +848,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
	__rhashtable_insert(ht, obj, new_tbl, new_hash);

exit:
	if (unlikely(old_tbl != new_tbl))
		spin_unlock_bh(new_bucket_lock);
	spin_unlock_bh(old_bucket_lock);

	unlock_buckets(new_tbl, old_tbl, new_hash);
	rcu_read_unlock();

	return success;