Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a2c16d6c authored by Hugh Dickins, committed by Linus Torvalds
Browse files

mm: let swap use exceptional entries



If swap entries are to be stored along with struct page pointers in a
radix tree, they need to be distinguished as exceptional entries.

Most of the handling of swap entries in radix tree will be contained in
shmem.c, but a few functions in filemap.c's common code need to check
for their appearance: find_get_page(), find_lock_page(),
find_get_pages() and find_get_pages_contig().

So as not to slow their fast paths, tuck those checks inside the
existing checks for unlikely radix_tree_deref_slot(); except for
find_lock_page(), where it is an added test.  And make it a BUG in
find_get_pages_tag(), which is not applied to tmpfs files.

A part of the reason for eliminating shmem_readpage() earlier, was to
minimize the places where common code would need to allow for swap
entries.

The swp_entry_t known to swapfile.c must be massaged into a slightly
different form when stored in the radix tree, just as it gets massaged
into a pte_t when stored in page tables.

In an i386 kernel this limits its information (type and page offset) to
30 bits: given 32 "types" of swapfile and 4kB pagesize, that's a maximum
swapfile size of 128GB.  Which is less than the 512GB we previously
allowed with X86_PAE (where the swap entry can occupy the entire upper
32 bits of a pte_t), but not a new limitation on 32-bit without PAE; and
there's not a new limitation on 64-bit (where swap filesize is already
limited to 16TB by a 32-bit page offset).  Thirty areas of 128GB is
probably still enough swap for a 64GB 32-bit machine.

Provide swp_to_radix_entry() and radix_to_swp_entry() conversions, and
enforce filesize limit in read_swap_header(), just as for ptes.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 6328650b
Loading
Loading
Loading
Loading
+23 −0
Original line number Diff line number Diff line
#ifndef _LINUX_SWAPOPS_H
#define _LINUX_SWAPOPS_H

#include <linux/radix-tree.h>

/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
@@ -76,6 +81,22 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
	return __swp_entry_to_pte(arch_entry);
}

/*
 * Recover the swp_entry_t held in a radix tree slot value: the inverse
 * of swp_to_radix_entry().
 *
 * @arg: slot value as produced by swp_to_radix_entry().
 *
 * Returns a swp_entry_t whose val is @arg with the low-order bits below
 * RADIX_TREE_EXCEPTIONAL_SHIFT shifted out.
 */
static inline swp_entry_t radix_to_swp_entry(void *arg)
{
	unsigned long slot_bits = (unsigned long)arg;
	swp_entry_t decoded;

	/* Drop the low bits added when the entry was encoded. */
	decoded.val = slot_bits >> RADIX_TREE_EXCEPTIONAL_SHIFT;
	return decoded;
}

/*
 * Encode a swp_entry_t as a value that can sit in a radix tree slot
 * alongside struct page pointers and still be told apart from them.
 *
 * @entry: swap entry to encode.
 *
 * Returns entry.val shifted left by RADIX_TREE_EXCEPTIONAL_SHIFT with
 * RADIX_TREE_EXCEPTIONAL_ENTRY or'ed in, cast to a pointer.  Any high
 * bits of entry.val that do not survive the shift are lost, so callers
 * must keep swap entries within the range that round-trips through
 * radix_to_swp_entry().
 */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
	unsigned long encoded = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

	/* Mark the slot value as exceptional rather than a page pointer. */
	encoded |= RADIX_TREE_EXCEPTIONAL_ENTRY;
	return (void *)encoded;
}

#ifdef CONFIG_MIGRATION
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
@@ -169,3 +190,5 @@ static inline int non_swap_entry(swp_entry_t entry)
	return 0;
}
#endif

#endif /* _LINUX_SWAPOPS_H */
+31 −18
Original line number Diff line number Diff line
@@ -714,9 +714,12 @@ struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_deref_retry(page))
		if (radix_tree_exception(page)) {
			if (radix_tree_exceptional_entry(page))
				goto out;
			/* radix_tree_deref_retry(page) */
			goto repeat;

		}
		if (!page_cache_get_speculative(page))
			goto repeat;

@@ -753,7 +756,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)

repeat:
	page = find_get_page(mapping, offset);
	if (page) {
	if (page && !radix_tree_exception(page)) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
@@ -849,11 +852,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			if (radix_tree_exceptional_entry(page))
				continue;
			/*
		 * This can only trigger when the entry at index 0 moves out
		 * of or back to the root: none yet gotten, safe to restart.
			 * radix_tree_deref_retry(page):
			 * can only trigger when entry at index 0 moves out of
			 * or back to root: none yet gotten, safe to restart.
			 */
		if (radix_tree_deref_retry(page)) {
			WARN_ON(start | i);
			goto restart;
		}
@@ -912,12 +918,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			if (radix_tree_exceptional_entry(page))
				break;
			/*
		 * This can only trigger when the entry at index 0 moves out
		 * of or back to the root: none yet gotten, safe to restart.
			 * radix_tree_deref_retry(page):
			 * can only trigger when entry at index 0 moves out of
			 * or back to root: none yet gotten, safe to restart.
			 */
		if (radix_tree_deref_retry(page))
			goto restart;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;
@@ -977,12 +987,15 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			BUG_ON(radix_tree_exceptional_entry(page));
			/*
		 * This can only trigger when the entry at index 0 moves out
		 * of or back to the root: none yet gotten, safe to restart.
			 * radix_tree_deref_retry(page):
			 * can only trigger when entry at index 0 moves out of
			 * or back to root: none yet gotten, safe to restart.
			 */
		if (radix_tree_deref_retry(page))
			goto restart;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;
+12 −8
Original line number Diff line number Diff line
@@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,

	/*
	 * Find out how many pages are allowed for a single swap
	 * device. There are two limiting factors: 1) the number of
	 * bits for the swap offset in the swp_entry_t type and
	 * 2) the number of bits in the a swap pte as defined by
	 * the different architectures. In order to find the
	 * largest possible bit mask a swap entry with swap type 0
	 * device. There are three limiting factors: 1) the number
	 * of bits for the swap offset in the swp_entry_t type, and
	 * 2) the number of bits in the swap pte as defined by the
	 * different architectures, and 3) the number of free bits
	 * in an exceptional radix_tree entry. In order to find the
	 * largest possible bit mask, a swap entry with swap type 0
	 * and swap offset ~0UL is created, encoded to a swap pte,
	 * decoded to a swp_entry_t again and finally the swap
	 * decoded to a swp_entry_t again, and finally the swap
	 * offset is extracted. This will mask all the bits from
	 * the initial ~0UL mask that can't be encoded in either
	 * the swp_entry_t or the architecture definition of a
	 * swap pte.
	 * swap pte.  Then the same is done for a radix_tree entry.
	 */
	maxpages = swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
			swp_entry_to_pte(swp_entry(0, ~0UL))));
	maxpages = swp_offset(radix_to_swp_entry(
			swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;

	if (maxpages > swap_header->info.last_page) {
		maxpages = swap_header->info.last_page + 1;
		/* p->max is an unsigned int: don't overflow it */