
Commit deceb6cd authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] mm: follow_page with inner ptlock



Final step in pushing down common core's page_table_lock.  follow_page no
longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself;
and so no page_table_lock is taken in get_user_pages itself.
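
For reference, the locking pattern follow_page now uses internally, as a
minimal sketch that elides the pgd/pud/pmd walk and the error paths:

	pte_t *ptep, pte;
	spinlock_t *ptl;

	/* map the pte and take its page-table lock in one step */
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	/* ... validate the pte, take a reference on its page ... */
	pte_unmap_unlock(ptep, ptl);	/* unlock and unmap together */

pte_offset_map_lock and pte_unmap_unlock come from earlier patches in this
series; with a split ptlock they take the per-page-table lock rather than
mm->page_table_lock.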

But get_user_pages (and get_futex_key) do then need follow_page to pin the
page for them: take Daniel's suggestion of bitflags to follow_page.

Need one for WRITE, another for TOUCH (it was the accessed flag before:
vanished along with check_user_page_readable, but surely get_numa_maps is
wrong to mark every page it finds as accessed), another for GET.
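
In sketch, a caller that needs the page held across use, with its pte
marked accessed, now writes something like this (hypothetical caller;
get_futex_key below is the real example):

	struct page *page;

	page = follow_page(mm, addr, FOLL_TOUCH | FOLL_GET);
	if (page) {
		/* FOLL_GET took a reference, so the page stays good
		 * here even though no page table lock is held */
		put_page(page);
	}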

And another, ANON to dispose of untouched_anonymous_page: it seems silly for
that to descend a second time, let follow_page observe if there was no page
table and return ZERO_PAGE if so.  Fix minor bug in that: check VM_LOCKED -
make_pages_present ought to make readonly anonymous present.
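
The VM_LOCKED fix shows up in get_user_pages' flag setup, sketched here to
match the mm/memory.c hunk below:

	foll_flags = FOLL_TOUCH;
	if (pages)
		foll_flags |= FOLL_GET;
	/* Only a read of an anonymous, non-mlocked vma may be satisfied
	 * by ZERO_PAGE: VM_LOCKED must really instantiate the pages. */
	if (!write && !(vma->vm_flags & VM_LOCKED) &&
	    (!vma->vm_ops || !vma->vm_ops->nopage))
		foll_flags |= FOLL_ANON;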

Give get_numa_maps a cond_resched while we're there.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent c34d1b4d
fs/proc/task_mmu.c  +1 −2
@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;

-	spin_lock(&mm->page_table_lock);
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 			if (PageAnon(page))
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }

include/linux/mm.h  +12 −8
@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }

-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
-
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-		int write);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-		unsigned long, unsigned long, pgprot_t);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+			unsigned long pfn, unsigned long size, pgprot_t);
+
+struct page *follow_page(struct mm_struct *, unsigned long address,
+			unsigned int foll_flags);
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+#define FOLL_GET	0x04	/* do get_page on page */
+#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */

 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
kernel/futex.c  +2 −4
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
-	page = follow_page(mm, uaddr, 0);
+	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		put_page(page);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);

 	/*
 	 * Do it the general way.
mm/memory.c  +71 −81
@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,

 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;

-	page = follow_huge_addr(mm, address, write);
-	if (! IS_ERR(page))
-		return page;
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}

+	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
+		goto no_page_table;

 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
+		goto no_page_table;
 	
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto out;
-	if (pmd_huge(*pmd))
-		return follow_huge_pmd(mm, address, pmd, write);
+		goto no_page_table;
+
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+		goto out;
+	}

-	ptep = pte_offset_map(pmd, address);
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
 		goto out;

 	pte = *ptep;
-	pte_unmap(ptep);
-	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
-			goto out;
-		pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) &&!PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
-			return page;
-		}
-	}
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	pfn = pte_pfn(pte);
+	if (!pfn_valid(pfn))
+		goto unlock;

+	page = pfn_to_page(pfn);
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+unlock:
+	pte_unmap_unlock(ptep, ptl);
 out:
-	return NULL;
-}
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-			 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	/* Check if the vma is for an anonymous mapping. */
-	if (vma->vm_ops && vma->vm_ops->nopage)
-		return 0;
-
-	/* Check if page directory entry exists. */
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		return 1;
-
-	/* Check if page middle directory entry exists. */
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
-
-	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+	return page;
+
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
 }

 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int flags;
+	unsigned int vm_flags;

 	/* 
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;

 	do {
 		struct vm_area_struct *vma;
+		unsigned int foll_flags;

 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}

 		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(flags & vma->vm_flags))
+				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;

 		if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 						&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
 		do {
-			int write_access = write;
 			struct page *page;

-			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write_access))) {
-				int ret;
-
-				/*
-				 * Shortcut for anonymous pages. We don't want
-				 * to force the creation of pages tables for
-				 * insanely big anonymously mapped areas that
-				 * nobody touched so far. This is important
-				 * for doing a core dump for these mappings.
-				 */
-				if (!write && untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
-				}
-				spin_unlock(&mm->page_table_lock);
-				ret = __handle_mm_fault(mm, vma, start, write_access);
-
+			if (write)
+				foll_flags |= FOLL_WRITE;
+
+			cond_resched();
+			while (!(page = follow_page(mm, start, foll_flags))) {
+				int ret;
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					write_access = 0;
+					foll_flags &= ~FOLL_WRITE;
 				
 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }
mm/nommu.c  +2 −1
@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)

 EXPORT_SYMBOL(find_vma);

-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int foll_flags)
 {
 	return NULL;
 }