Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 148deab2 authored by Matthew Wilcox's avatar Matthew Wilcox Committed by Linus Torvalds
Browse files

radix-tree: improve multiorder iterators

This fixes several interlinked problems with the iterators in the
presence of multiorder entries.

1. radix_tree_iter_next() would only advance by one slot, which would
   result in the iterators returning the same entry more than once if
   there were sibling entries.

2. radix_tree_next_slot() could return an internal pointer instead of
   a user pointer if a tagged multiorder entry was immediately followed by
   an entry of lower order.

3. radix_tree_next_slot() expanded to a lot more code than it used to
   when multiorder support was compiled in.  And I wasn't comfortable with
   entry_to_node() being in a header file.

Fixing radix_tree_iter_next() for the presence of sibling entries
necessarily involves examining the contents of the radix tree, so we now
need to pass 'slot' to radix_tree_iter_next(), and we need to change the
calling convention so it is called *before* dropping the lock which
protects the tree.  Also rename it to radix_tree_iter_resume(), as some
people thought it was necessary to call radix_tree_iter_next() each time
around the loop.

radix_tree_next_slot() becomes closer to how it looked before multiorder
support was introduced.  It only checks to see if the next entry in the
chunk is a sibling entry or a pointer to a node; this should be rare
enough that handling this case out of line is not a performance impact
(and such impact is amortised by the fact that the entry we just
processed was a multiorder entry).  Also, radix_tree_next_slot() used to
force a new chunk lookup for untagged entries, which is more expensive
than the out of line sibling entry skipping.

Link: http://lkml.kernel.org/r/1480369871-5271-55-git-send-email-mawilcox@linuxonhyperv.com


Signed-off-by: default avatarMatthew Wilcox <mawilcox@microsoft.com>
Tested-by: default avatarKirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent b35df27a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
				slot = radix_tree_iter_retry(&iter);
			continue;
		}
		slot = radix_tree_iter_next(&iter);
		slot = radix_tree_iter_resume(slot, &iter);
		spin_unlock(&fs_info->buffer_lock);
		free_extent_buffer_stale(eb);
		spin_lock(&fs_info->buffer_lock);
+30 −41
Original line number Diff line number Diff line
@@ -403,20 +403,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
}

/**
 * radix_tree_iter_next - resume iterating when the chunk may be invalid
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot: pointer to current slot
 * @iter: iterator state
 * Returns: New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * to continue the iteration from the next index.
 * before releasing the lock to continue the iteration from the next index.
 */
static inline __must_check
void **radix_tree_iter_next(struct radix_tree_iter *iter)
{
	iter->next_index = __radix_tree_iter_add(iter, 1);
	iter->tags = 0;
	return NULL;
}
void **__must_check radix_tree_iter_resume(void **slot,
					struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - get current chunk size
@@ -430,10 +427,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
	return (iter->next_index - iter->index) >> iter_shift(iter);
}

static inline struct radix_tree_node *entry_to_node(void *ptr)
#ifdef CONFIG_RADIX_TREE_MULTIORDER
void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
				unsigned flags);
#else
/* Can't happen without sibling entries, but the compiler can't tell that */
static inline void ** __radix_tree_next_slot(void **slot,
				struct radix_tree_iter *iter, unsigned flags)
{
	return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
	return slot;
}
#endif

/**
 * radix_tree_next_slot - find next slot in chunk
@@ -447,7 +451,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr)
 * For tagged lookup it also eats @iter->tags.
 *
 * There are several cases where 'slot' can be passed in as NULL to this
 * function.  These cases result from the use of radix_tree_iter_next() or
 * function.  These cases result from the use of radix_tree_iter_resume() or
 * radix_tree_iter_retry().  In these cases we don't end up dereferencing
 * 'slot' because either:
 * a) we are doing tagged iteration and iter->tags has been set to 0, or
@@ -458,51 +462,31 @@ static __always_inline void **
radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
{
	if (flags & RADIX_TREE_ITER_TAGGED) {
		void *canon = slot;

		iter->tags >>= 1;
		if (unlikely(!iter->tags))
			return NULL;
		while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
					radix_tree_is_internal_node(slot[1])) {
			if (entry_to_node(slot[1]) == canon) {
				iter->tags >>= 1;
				iter->index = __radix_tree_iter_add(iter, 1);
				slot++;
				continue;
			}
			iter->next_index = __radix_tree_iter_add(iter, 1);
			return NULL;
		}
		if (likely(iter->tags & 1ul)) {
			iter->index = __radix_tree_iter_add(iter, 1);
			return slot + 1;
			slot++;
			goto found;
		}
		if (!(flags & RADIX_TREE_ITER_CONTIG)) {
			unsigned offset = __ffs(iter->tags);

			iter->tags >>= offset;
			iter->index = __radix_tree_iter_add(iter, offset + 1);
			return slot + offset + 1;
			iter->tags >>= offset++;
			iter->index = __radix_tree_iter_add(iter, offset);
			slot += offset;
			goto found;
		}
	} else {
		long count = radix_tree_chunk_size(iter);
		void *canon = slot;

		while (--count > 0) {
			slot++;
			iter->index = __radix_tree_iter_add(iter, 1);

			if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
			    radix_tree_is_internal_node(*slot)) {
				if (entry_to_node(*slot) == canon)
					continue;
				iter->next_index = iter->index;
				break;
			}

			if (likely(*slot))
				return slot;
				goto found;
			if (flags & RADIX_TREE_ITER_CONTIG) {
				/* forbid switching to the next chunk */
				iter->next_index = 0;
@@ -511,6 +495,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
		}
	}
	return NULL;

 found:
	if (unlikely(radix_tree_is_internal_node(*slot)))
		return __radix_tree_next_slot(slot, iter, flags);
	return slot;
}

/**
@@ -561,6 +550,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
	     slot || (slot = radix_tree_next_chunk(root, iter,		\
			      RADIX_TREE_ITER_TAGGED | tag)) ;		\
	     slot = radix_tree_next_slot(slot, iter,			\
				RADIX_TREE_ITER_TAGGED))
				RADIX_TREE_ITER_TAGGED | tag))

#endif /* _LINUX_RADIX_TREE_H */
+121 −17
Original line number Diff line number Diff line
@@ -69,6 +69,11 @@ struct radix_tree_preload {
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };

static inline struct radix_tree_node *entry_to_node(void *ptr)
{
	return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
}

static inline void *node_to_entry(void *ptr)
{
	return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@@ -1104,6 +1109,120 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
#endif
}

/* Construct iter->tags bit-mask from node->tags[tag] array */
static void set_iter_tags(struct radix_tree_iter *iter,
				struct radix_tree_node *node, unsigned offset,
				unsigned tag)
{
	unsigned tag_long = offset / BITS_PER_LONG;
	unsigned tag_bit  = offset % BITS_PER_LONG;

	iter->tags = node->tags[tag][tag_long] >> tag_bit;

	/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
	if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
		/* Pick tags from next element */
		if (tag_bit)
			iter->tags |= node->tags[tag][tag_long + 1] <<
						(BITS_PER_LONG - tag_bit);
		/* Clip chunk size, here only BITS_PER_LONG tags */
		iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
	}
}

#ifdef CONFIG_RADIX_TREE_MULTIORDER
static void **skip_siblings(struct radix_tree_node **nodep,
			void **slot, struct radix_tree_iter *iter)
{
	void *sib = node_to_entry(slot - 1);

	while (iter->index < iter->next_index) {
		*nodep = rcu_dereference_raw(*slot);
		if (*nodep && *nodep != sib)
			return slot;
		slot++;
		iter->index = __radix_tree_iter_add(iter, 1);
		iter->tags >>= 1;
	}

	*nodep = NULL;
	return NULL;
}

void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
					unsigned flags)
{
	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
	struct radix_tree_node *node = rcu_dereference_raw(*slot);

	slot = skip_siblings(&node, slot, iter);

	while (radix_tree_is_internal_node(node)) {
		unsigned offset;
		unsigned long next_index;

		if (node == RADIX_TREE_RETRY)
			return slot;
		node = entry_to_node(node);
		iter->shift = node->shift;

		if (flags & RADIX_TREE_ITER_TAGGED) {
			offset = radix_tree_find_next_bit(node, tag, 0);
			if (offset == RADIX_TREE_MAP_SIZE)
				return NULL;
			slot = &node->slots[offset];
			iter->index = __radix_tree_iter_add(iter, offset);
			set_iter_tags(iter, node, offset, tag);
			node = rcu_dereference_raw(*slot);
		} else {
			offset = 0;
			slot = &node->slots[0];
			for (;;) {
				node = rcu_dereference_raw(*slot);
				if (node)
					break;
				slot++;
				offset++;
				if (offset == RADIX_TREE_MAP_SIZE)
					return NULL;
			}
			iter->index = __radix_tree_iter_add(iter, offset);
		}
		if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
			goto none;
		next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
		if (next_index < iter->next_index)
			iter->next_index = next_index;
	}

	return slot;
 none:
	iter->next_index = 0;
	return NULL;
}
EXPORT_SYMBOL(__radix_tree_next_slot);
#else
static void **skip_siblings(struct radix_tree_node **nodep,
			void **slot, struct radix_tree_iter *iter)
{
	return slot;
}
#endif

void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
{
	struct radix_tree_node *node;

	slot++;
	iter->index = __radix_tree_iter_add(iter, 1);
	node = rcu_dereference_raw(*slot);
	skip_siblings(&node, slot, iter);
	iter->next_index = iter->index;
	iter->tags = 0;
	return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
@@ -1191,23 +1310,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
	iter->next_index = (index | node_maxindex(node)) + 1;
	__set_iter_shift(iter, node->shift);

	/* Construct iter->tags bit-mask from node->tags[tag] array */
	if (flags & RADIX_TREE_ITER_TAGGED) {
		unsigned tag_long, tag_bit;

		tag_long = offset / BITS_PER_LONG;
		tag_bit  = offset % BITS_PER_LONG;
		iter->tags = node->tags[tag][tag_long] >> tag_bit;
		/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
		if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
			/* Pick tags from next element */
			if (tag_bit)
				iter->tags |= node->tags[tag][tag_long + 1] <<
						(BITS_PER_LONG - tag_bit);
			/* Clip chunk size, here only BITS_PER_LONG tags */
			iter->next_index = index + BITS_PER_LONG;
		}
	}
	if (flags & RADIX_TREE_ITER_TAGGED)
		set_iter_tags(iter, node, offset, tag);

	return node->slots + offset;
}
+3 −4
Original line number Diff line number Diff line
@@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm,
		radix_tree_replace_slot(&mapping->page_tree, slot,
				new_page + (index % HPAGE_PMD_NR));

		slot = radix_tree_iter_next(&iter);
		slot = radix_tree_iter_resume(slot, &iter);
		index++;
		continue;
out_lru:
@@ -1546,7 +1546,6 @@ static void collapse_shmem(struct mm_struct *mm,
				/* Put holes back where they were */
				radix_tree_delete(&mapping->page_tree,
						  iter.index);
				slot = radix_tree_iter_next(&iter);
				continue;
			}

@@ -1557,11 +1556,11 @@ static void collapse_shmem(struct mm_struct *mm,
			page_ref_unfreeze(page, 2);
			radix_tree_replace_slot(&mapping->page_tree,
						slot, page);
			slot = radix_tree_iter_resume(slot, &iter);
			spin_unlock_irq(&mapping->tree_lock);
			putback_lru_page(page);
			unlock_page(page);
			spin_lock_irq(&mapping->tree_lock);
			slot = radix_tree_iter_next(&iter);
		}
		VM_BUG_ON(nr_none);
		spin_unlock_irq(&mapping->tree_lock);
@@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
		present++;

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
			slot = radix_tree_iter_next(&iter);
		}
	}
	rcu_read_unlock();
+3 −3
Original line number Diff line number Diff line
@@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
			swapped++;

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
			slot = radix_tree_iter_next(&iter);
		}
	}

@@ -2447,8 +2447,8 @@ static void shmem_tag_pins(struct address_space *mapping)
		}

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
			slot = radix_tree_iter_next(&iter);
		}
	}
	rcu_read_unlock();
@@ -2517,8 +2517,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
			spin_unlock_irq(&mapping->tree_lock);
continue_resched:
			if (need_resched()) {
				slot = radix_tree_iter_resume(slot, &iter);
				cond_resched_rcu();
				slot = radix_tree_iter_next(&iter);
			}
		}
		rcu_read_unlock();
Loading