
Commit a5430dda authored by Jérôme Glisse, committed by Linus Torvalds

mm/migrate: support un-addressable ZONE_DEVICE page in migration

Allow unmapping and restoring the special swap entries used for
un-addressable ZONE_DEVICE memory.
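
In rough terms, the unmap side turns the device-private swap entry into a
migration entry, and the restore side rebuilds the device-private entry. A
userspace toy model of that entry round trip (simplified types and a pfn in
place of a struct page; the real helpers live in include/linux/swapops.h):

#include <assert.h>
#include <stdbool.h>

/* Toy model of a swp_entry_t: a type tag plus an offset (here a pfn). */
enum swp_type { SWP_MIGRATION_READ, SWP_MIGRATION_WRITE,
		SWP_DEVICE_READ, SWP_DEVICE_WRITE };

typedef struct { enum swp_type type; unsigned long offset; } swp_entry_t;

static swp_entry_t make_device_private_entry(unsigned long pfn, bool write)
{
	swp_entry_t e = { write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ, pfn };
	return e;
}

static bool is_device_private_entry(swp_entry_t e)
{
	return e.type == SWP_DEVICE_READ || e.type == SWP_DEVICE_WRITE;
}

static bool is_write_device_private_entry(swp_entry_t e)
{
	return e.type == SWP_DEVICE_WRITE;
}

int main(void)
{
	/* Unmap: the pte of an un-addressable page holds such an entry... */
	swp_entry_t e = make_device_private_entry(0x42, true);

	/* ...and restore recognizes it and rebuilds the device pte. */
	assert(is_device_private_entry(e));
	assert(is_write_device_private_entry(e));
	assert(e.offset == 0x42);
	return 0;
}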

Link: http://lkml.kernel.org/r/20170817000548.32038-17-jglisse@redhat.com


Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8c3328f1
include/linux/migrate.h +8 −2
@@ -159,12 +159,18 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,

 #ifdef CONFIG_MIGRATION
 
+/*
+ * Watch out for PAE architecture, which has an unsigned long, and might not
+ * have enough bits to store all physical address and flags. So far we have
+ * enough room for all our flags.
+ */
 #define MIGRATE_PFN_VALID	(1UL << 0)
 #define MIGRATE_PFN_MIGRATE	(1UL << 1)
 #define MIGRATE_PFN_LOCKED	(1UL << 2)
 #define MIGRATE_PFN_WRITE	(1UL << 3)
-#define MIGRATE_PFN_ERROR	(1UL << 4)
-#define MIGRATE_PFN_SHIFT	5
+#define MIGRATE_PFN_DEVICE	(1UL << 4)
+#define MIGRATE_PFN_ERROR	(1UL << 5)
+#define MIGRATE_PFN_SHIFT	6
 
 static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
 {
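
For reference, migrate_pfn() packs a page frame number above these flag bits
and migrate_pfn_to_page() unpacks it. A minimal userspace sketch of the same
packing scheme (illustrative only, not the kernel implementation):

#include <assert.h>
#include <stdio.h>

/* Mirror of the flag layout after this commit. */
#define MIGRATE_PFN_VALID	(1UL << 0)
#define MIGRATE_PFN_MIGRATE	(1UL << 1)
#define MIGRATE_PFN_LOCKED	(1UL << 2)
#define MIGRATE_PFN_WRITE	(1UL << 3)
#define MIGRATE_PFN_DEVICE	(1UL << 4)
#define MIGRATE_PFN_ERROR	(1UL << 5)
#define MIGRATE_PFN_SHIFT	6

/* Pack a pfn above the flag bits, as the kernel's migrate_pfn() does. */
static unsigned long migrate_pfn(unsigned long pfn)
{
	return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
}

/* Recover the pfn, as migrate_pfn_to_page() does before pfn_to_page(). */
static unsigned long migrate_pfn_to_pfn(unsigned long mpfn)
{
	return mpfn >> MIGRATE_PFN_SHIFT;
}

int main(void)
{
	unsigned long mpfn = migrate_pfn(0x12345) |
			     MIGRATE_PFN_MIGRATE | MIGRATE_PFN_DEVICE;

	assert(migrate_pfn_to_pfn(mpfn) == 0x12345);
	assert(mpfn & MIGRATE_PFN_DEVICE);
	printf("mpfn=%#lx pfn=%#lx\n", mpfn, migrate_pfn_to_pfn(mpfn));
	return 0;
}

Raising MIGRATE_PFN_SHIFT to 6 spends one more low bit on flags, which is
what the PAE comment above is warning about.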
mm/migrate.c +121 −28
@@ -36,6 +36,7 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/memremap.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
@@ -237,7 +238,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		if (is_write_migration_entry(entry))
 			pte = maybe_mkwrite(pte, vma);
 
-		flush_dcache_page(new);
+		if (unlikely(is_zone_device_page(new)) &&
+		    is_device_private_page(new)) {
+			entry = make_device_private_entry(new, pte_write(pte));
+			pte = swp_entry_to_pte(entry);
+		} else
+			flush_dcache_page(new);
+
 #ifdef CONFIG_HUGETLB_PAGE
 		if (PageHuge(new)) {
 			pte = pte_mkhuge(pte);
@@ -2205,17 +2212,40 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		pte = *ptep;
 		pfn = pte_pfn(pte);
 
-		if (!pte_present(pte)) {
+		if (pte_none(pte)) {
 			mpfn = pfn = 0;
 			goto next;
 		}
 
+		if (!pte_present(pte)) {
+			mpfn = pfn = 0;
+
+			/*
+			 * Only care about unaddressable device page special
+			 * page table entry. Other special swap entries are not
+			 * migratable, and we ignore regular swapped page.
+			 */
+			entry = pte_to_swp_entry(pte);
+			if (!is_device_private_entry(entry))
+				goto next;
+
+			page = device_private_entry_to_page(entry);
+			mpfn = migrate_pfn(page_to_pfn(page))|
+				MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+			if (is_write_device_private_entry(entry))
+				mpfn |= MIGRATE_PFN_WRITE;
+		} else {
+			page = vm_normal_page(migrate->vma, addr, pte);
+			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+		}
 
 		/* FIXME support THP */
-		page = vm_normal_page(migrate->vma, addr, pte);
 		if (!page || !page->mapping || PageTransCompound(page)) {
 			mpfn = pfn = 0;
 			goto next;
 		}
-		pfn = page_to_pfn(page);
 
 		/*
 		 * By getting a reference on the page we pin it and that blocks
@@ -2228,8 +2258,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		 */
 		get_page(page);
 		migrate->cpages++;
-		mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
-		mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 
 		/*
 		 * Optimize for the common case where page is only mapped once
@@ -2256,10 +2284,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			 */
 			page_remove_rmap(page, false);
 			put_page(page);
+
+			if (pte_present(pte))
+				unmapped++;
 		}
 
 next:
 		migrate->dst[migrate->npages] = 0;
 		migrate->src[migrate->npages++] = mpfn;
 	}
 	arch_leave_lazy_mmu_mode();
@@ -2329,6 +2360,28 @@ static bool migrate_vma_check_page(struct page *page)
 	if (PageCompound(page))
 		return false;
 
+	/* Page from ZONE_DEVICE have one extra reference */
+	if (is_zone_device_page(page)) {
+		/*
+		 * Private page can never be pin as they have no valid pte and
+		 * GUP will fail for those. Yet if there is a pending migration
+		 * a thread might try to wait on the pte migration entry and
+		 * will bump the page reference count. Sadly there is no way to
+		 * differentiate a regular pin from migration wait. Hence to
+		 * avoid 2 racing thread trying to migrate back to CPU to enter
+		 * infinite loop (one stoping migration because the other is
+		 * waiting on pte migration entry). We always return true here.
+		 *
+		 * FIXME proper solution is to rework migration_entry_wait() so
+		 * it does not need to take a reference on page.
+		 */
+		if (is_device_private_page(page))
+			return true;
+
+		/* Other ZONE_DEVICE memory type are not supported */
+		return false;
+	}
+
 	if ((page_count(page) - extra) > page_mapcount(page))
 		return false;

@@ -2379,6 +2432,8 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
 			migrate->src[i] |= MIGRATE_PFN_LOCKED;
 		}
 
+		/* ZONE_DEVICE pages are not on LRU */
+		if (!is_zone_device_page(page)) {
 			if (!PageLRU(page) && allow_drain) {
 				/* Drain CPU's pagevec */
 				lru_add_drain_all();
@@ -2399,20 +2454,29 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
 				continue;
 			}
 
 			/* Drop the reference we took in collect */
 			put_page(page);
+		}
 
 		if (!migrate_vma_check_page(page)) {
 			if (remap) {
 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 				migrate->cpages--;
 				restore++;
 
+				if (!is_zone_device_page(page)) {
 					get_page(page);
 					putback_lru_page(page);
+				}
 			} else {
 				migrate->src[i] = 0;
 				unlock_page(page);
 				migrate->cpages--;
 
+				if (!is_zone_device_page(page))
 					putback_lru_page(page);
+				else
+					put_page(page);
 			}
 		}
 	}
@@ -2483,6 +2547,9 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
 		unlock_page(page);
 		restore--;
 
-		putback_lru_page(page);
+		if (is_zone_device_page(page))
+			put_page(page);
+		else
+			putback_lru_page(page);
 	}
 }
@@ -2514,6 +2581,26 @@ static void migrate_vma_pages(struct migrate_vma *migrate)

 		mapping = page_mapping(page);
 
+		if (is_zone_device_page(newpage)) {
+			if (is_device_private_page(newpage)) {
+				/*
+				 * For now only support private anonymous when
+				 * migrating to un-addressable device memory.
+				 */
+				if (mapping) {
+					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+					continue;
+				}
+			} else {
+				/*
+				 * Other types of ZONE_DEVICE page are not
+				 * supported.
+				 */
+				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+				continue;
+			}
+		}
+
 		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
 		if (r != MIGRATEPAGE_SUCCESS)
 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -2554,10 +2641,16 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
 		unlock_page(page);
 		migrate->cpages--;
 
-		putback_lru_page(page);
+		if (is_zone_device_page(page))
+			put_page(page);
+		else
+			putback_lru_page(page);
 
 		if (newpage != page) {
 			unlock_page(newpage);
-			putback_lru_page(newpage);
+			if (is_zone_device_page(newpage))
+				put_page(newpage);
+			else
+				putback_lru_page(newpage);
 		}
 	}
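
Across these mm/migrate.c changes, the stages communicate through one
convention: a stage that cannot handle a page clears MIGRATE_PFN_MIGRATE in
migrate->src[i], and later stages skip anything no longer flagged. A small
userspace sketch of that convention (toy pfns; the veto shown is hypothetical):

#include <stdio.h>

/* Flag layout from include/linux/migrate.h after this commit. */
#define MIGRATE_PFN_VALID	(1UL << 0)
#define MIGRATE_PFN_MIGRATE	(1UL << 1)
#define MIGRATE_PFN_DEVICE	(1UL << 4)
#define MIGRATE_PFN_SHIFT	6

static unsigned long migrate_pfn(unsigned long pfn)
{
	return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
}

int main(void)
{
	/* What the collect stage might leave in src[]. */
	unsigned long src[3] = {
		migrate_pfn(0x100) | MIGRATE_PFN_MIGRATE,
		0, /* pte_none, or page not migratable */
		migrate_pfn(0x200) | MIGRATE_PFN_MIGRATE | MIGRATE_PFN_DEVICE,
	};

	/* A later stage (prepare/unmap/pages) vetoing entry 0, the way the
	 * kernel clears MIGRATE_PFN_MIGRATE when a check fails. */
	src[0] &= ~MIGRATE_PFN_MIGRATE;

	/* Finalize: only entries still flagged get their ptes rewritten. */
	for (int i = 0; i < 3; i++)
		printf("src[%d] pfn=%#lx %s\n", i, src[i] >> MIGRATE_PFN_SHIFT,
		       (src[i] & MIGRATE_PFN_MIGRATE) ? "migrated" : "skipped");
	return 0;
}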
mm/page_vma_mapped.c +10 −0
@@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 		if (!is_swap_pte(*pvmw->pte))
 			return false;
 		entry = pte_to_swp_entry(*pvmw->pte);
+
 		if (!is_migration_entry(entry))
 			return false;
 		if (migration_entry_to_page(entry) - pvmw->page >=
@@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 		WARN_ON_ONCE(1);
 #endif
 	} else {
+		if (is_swap_pte(*pvmw->pte)) {
+			swp_entry_t entry;
+
+			entry = pte_to_swp_entry(*pvmw->pte);
+			if (is_device_private_entry(entry) &&
+			    device_private_entry_to_page(entry) == pvmw->page)
+				return true;
+		}
+
 		if (!pte_present(*pvmw->pte))
 			return false;

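The new else-branch teaches check_pte() that a non-present device-private
swap entry still "maps" its page for the purposes of an rmap walk. A toy
model of that predicate (simplified pte structure, not the kernel's):

#include <assert.h>
#include <stdbool.h>

/* Toy pte: either a present mapping of a pfn, or a swap entry. */
struct pte {
	bool present;
	bool is_device_private;	/* only meaningful when !present */
	unsigned long pfn;	/* pfn mapped, or pfn encoded in the entry */
};

/* Model of the new check: a non-present device-private entry that
 * encodes the target page now counts as mapping it. */
static bool pte_maps_page(struct pte pte, unsigned long page_pfn)
{
	if (!pte.present)
		return pte.is_device_private && pte.pfn == page_pfn;
	return pte.pfn == page_pfn;
}

int main(void)
{
	struct pte swap_pte = { .present = false,
				.is_device_private = true, .pfn = 0x42 };

	assert(pte_maps_page(swap_pte, 0x42));
	assert(!pte_maps_page(swap_pte, 0x43));
	return 0;
}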
mm/rmap.c +26 −0
@@ -63,6 +63,7 @@
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
 #include <linux/page_idle.h>
+#include <linux/memremap.h>
 
 #include <asm/tlbflush.h>
 
@@ -1346,6 +1347,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
 		return true;
 
+	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+	    is_zone_device_page(page) && !is_device_private_page(page))
+		return true;
+
 	if (flags & TTU_SPLIT_HUGE_PMD) {
 		split_huge_pmd_address(vma, address,
 				flags & TTU_SPLIT_FREEZE, page);
@@ -1403,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		address = pvmw.address;
 
+		if (IS_ENABLED(CONFIG_MIGRATION) &&
+		    (flags & TTU_MIGRATION) &&
+		    is_zone_device_page(page)) {
+			swp_entry_t entry;
+			pte_t swp_pte;
+
+			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			entry = make_migration_entry(page, 0);
+			swp_pte = swp_entry_to_pte(entry);
+			if (pte_soft_dirty(pteval))
+				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+			goto discard;
+		}
+
 		if (!(flags & TTU_IGNORE_ACCESS)) {
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte)) {
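
The block added to try_to_unmap_one() clears the device-private pte and
installs a migration entry, preserving the soft-dirty bit on the way. A
userspace toy model of that bit-carrying conversion (toy flag layout, not
the real pte encoding):

#include <assert.h>

#define TOY_SOFT_DIRTY	(1UL << 0)	/* toy flag, not the real pte bit */
#define TOY_PFN_SHIFT	8

/* Toy version of the unmap step above: build a "migration entry" pte
 * from the old pte, carrying the soft-dirty bit across. */
static unsigned long toy_migration_pte(unsigned long old_pte,
				       unsigned long pfn)
{
	unsigned long swp_pte = pfn << TOY_PFN_SHIFT;

	if (old_pte & TOY_SOFT_DIRTY)
		swp_pte |= TOY_SOFT_DIRTY;	/* pte_swp_mksoft_dirty() */
	return swp_pte;
}

int main(void)
{
	unsigned long old_pte = (0x42UL << TOY_PFN_SHIFT) | TOY_SOFT_DIRTY;
	unsigned long swp_pte = toy_migration_pte(old_pte, 0x42);

	assert(swp_pte & TOY_SOFT_DIRTY);		/* soft-dirty survived */
	assert((swp_pte >> TOY_PFN_SHIFT) == 0x42);	/* pfn recoverable */
	return 0;
}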