
Commit e7c58097 authored by Mike Kravetz, committed by Linus Torvalds
hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"

This reverts commit c86aa7bb.

The reverted commit caused ABBA deadlocks when file migration raced with
file eviction for specific hugetlbfs files.  This was discovered with a
modified version of the LTP move_pages12 test.
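
An ABBA deadlock arises when two tasks acquire the same pair of locks in opposite order, each ending up blocked on the lock the other holds. A minimal userspace sketch of the pattern (illustrative only; the locks and code paths in the reported deadlock are hugetlbfs-internal, not these):

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Path 1 takes A then B. */
static void *take_ab(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);
	sleep(1);			/* widen the race window */
	pthread_mutex_lock(&lock_b);	/* blocks: take_ba holds B */
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

/* Path 2 takes B then A -- the opposite order. */
static void *take_ba(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_b);
	sleep(1);
	pthread_mutex_lock(&lock_a);	/* blocks: take_ab holds A */
	pthread_mutex_unlock(&lock_a);
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, take_ab, NULL);
	pthread_create(&t2, NULL, take_ba, NULL);
	pthread_join(t1, NULL);		/* never returns: ABBA deadlock */
	pthread_join(t2, NULL);
	return 0;
}

Run with both threads started, the program hangs: neither lock order can make progress, which is exactly the condition the revert eliminates by not imposing a second lock ordering on these paths.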

The purpose of the reverted patch was to close a long existing race
between hugetlbfs file truncation and page faults.  After more analysis
of the patch and impacted code, it was determined that i_mmap_rwsem
cannot be used for all required synchronization.  Therefore, revert this
patch while working on another approach to the underlying issue.
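
For context, the long-standing race in question, sketched as an interleaving (a simplified sketch based on the i_size checks visible in the mm/hugetlb.c hunks below, not a verbatim trace):

/*
 * faulting task                        truncating task
 * -------------                        ---------------
 * hugetlb_no_page()
 *   i_size check passes (idx < size)
 *                                      hugetlb_vmtruncate()
 *                                        shrinks i_size
 *                                        unmaps and frees pages in range
 *   allocates a new page and maps it
 *   beyond the new end of file
 *
 * The pre-patch (and post-revert) code narrows the window by rechecking
 * i_size under the page table lock, which truncation must also take
 * before it can unmap the page.
 */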

Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com


Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8ab88c71
fs/hugetlbfs/inode.c: +33 −28
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *	maps and global counts.
+ *	maps and global counts.  Page faults can not race with truncation
+ *	in this routine.  hugetlb_no_page() prevents page faults in the
+ *	truncated range.  It checks i_size before allocation, and again after
+ *	with the page table lock for the page held.  The same lock must be
+ *	acquired to unmap a page.
  * hole punch is indicated if end is not LLONG_MAX
  *	In the hole punch case we scan the range and release found pages.
  *	Only when releasing a page is the associated region/reserv map
  *	deleted.  The region/reserv map for ranges without associated
- *	pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ *	pages are not modified.  Page faults can race with hole punch.
+ *	This is indicated if we find a mapped page.
  * Note: If the passed end of range value is beyond the end of file, but
  * not LLONG_MAX this routine still performs a hole punch operation.
  */
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
+			u32 hash;
 
 			index = page->index;
+			hash = hugetlb_fault_mutex_hash(h, current->mm,
+							&pseudo_vma,
+							mapping, index, 0);
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			/*
-			 * A mapped page is impossible as callers should unmap
-			 * all references before calling.  And, i_mmap_rwsem
-			 * prevents the creation of additional mappings.
+			 * If page is mapped, it was faulted in after being
+			 * unmapped in caller.  Unmap (again) now after taking
+			 * the fault mutex.  The mutex will prevent faults
+			 * until we finish removing the page.
+			 *
+			 * This race can only happen in the hole punch case.
+			 * Getting here in a truncate operation is a bug.
 			 */
-			VM_BUG_ON(page_mapped(page));
+			if (unlikely(page_mapped(page))) {
+				BUG_ON(truncate_op);
+
+				i_mmap_lock_write(mapping);
+				hugetlb_vmdelete_list(&mapping->i_mmap,
+					index * pages_per_huge_page(h),
+					(index + 1) * pages_per_huge_page(h));
+				i_mmap_unlock_write(mapping);
+			}
 
 			lock_page(page);
 			/*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			}
 
 			unlock_page(page);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
 		huge_pagevec_release(&pvec);
 		cond_resched();
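
The hunks above restore per-page serialization through the shared fault-mutex table: removal and page faults on the same (mapping, index) pair hash to, and contend on, the same mutex. A compilable userspace analogue of that hashing/locking shape (hypothetical names and hash function; only the structure mirrors the kernel code above):

#include <pthread.h>
#include <stdint.h>

#define FAULT_MUTEX_TABLE_SIZE 64

static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_TABLE_SIZE] = {
	[0 ... FAULT_MUTEX_TABLE_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER,
};

/* Hash an (object, index) pair to one slot in the shared mutex table. */
static uint32_t fault_mutex_hash(const void *mapping, uint64_t index)
{
	uintptr_t key = (uintptr_t)mapping ^ (index * 0x9e3779b97f4a7c15ULL);

	return (uint32_t)(key % FAULT_MUTEX_TABLE_SIZE);
}

/*
 * Both the "fault" path and the "remove" path route through the same
 * slot's mutex, so removal of a given index cannot overlap a fault on
 * that index, without needing one lock per page.
 */
static void with_index_locked(const void *mapping, uint64_t index,
			      void (*fn)(uint64_t))
{
	uint32_t hash = fault_mutex_hash(mapping, index);

	pthread_mutex_lock(&fault_mutex_table[hash]);
	fn(index);
	pthread_mutex_unlock(&fault_mutex_table[hash]);
}

The design trade-off is that a fixed-size table bounds memory while still scattering contention: unrelated indexes only collide when their hashes do.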
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

 static void hugetlbfs_evict_inode(struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct resv_map *resv_map;
 
-	/*
-	 * The vfs layer guarantees that there are no other users of this
-	 * inode.  Therefore, it would be safe to call remove_inode_hugepages
-	 * without holding i_mmap_rwsem.  We acquire and hold here to be
-	 * consistent with other callers.  Since there will be no contention
-	 * on the semaphore, overhead is negligible.
-	 */
-	i_mmap_lock_write(mapping);
 	remove_inode_hugepages(inode, 0, LLONG_MAX);
-	i_mmap_unlock_write(mapping);
-
 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
 	/* root inode doesn't have the resv_map, so we should check it */
 	if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	i_mmap_lock_write(mapping);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	i_mmap_unlock_write(mapping);
+	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	return 0;
 }

@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			hugetlb_vmdelete_list(&mapping->i_mmap,
 						hole_start >> PAGE_SHIFT,
 						hole_end  >> PAGE_SHIFT);
-		remove_inode_hugepages(inode, hole_start, hole_end);
 		i_mmap_unlock_write(mapping);
+		remove_inode_hugepages(inode, hole_start, hole_end);
 		inode_unlock(inode);
 	}

@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		/* addr is the offset within the file (zero based) */
 		addr = index * hpage_size;
 
-		/*
-		 * fault mutex taken here, protects against fault path
-		 * and hole punch.  inode_lock previously taken protects
-		 * against truncation.
-		 */
+		/* mutex taken here, fault path and hole punch */
 		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
 						index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
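
Note the call-site reordering in hugetlb_vmtruncate() and hugetlbfs_punch_hole(): remove_inode_hugepages() now runs after i_mmap_unlock_write(). This appears necessary rather than cosmetic, because the restored removal loop may itself take i_mmap_rwsem when it finds a mapped page, and a kernel rwsem held for write is not recursive:

/*
 * Hypothetical mis-ordering, for illustration only -- this would
 * self-deadlock in the hole punch case after this revert:
 *
 *	i_mmap_lock_write(mapping);
 *	remove_inode_hugepages(inode, hole_start, hole_end);
 *		-> finds a mapped page
 *		-> i_mmap_lock_write(mapping);	// blocks on itself
 *	i_mmap_unlock_write(mapping);
 */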
mm/hugetlb.c: +11 −10
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * We can not race with truncation due to holding i_mmap_rwsem.
-	 * Check once here for faults beyond end of file.
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
 	 */
-	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
-		goto out;
-
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		if (idx >= size)
+			goto out;
+
 		/*
 		 * Check for page in userfault range
 		 */
@@ -3854,6 +3854,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
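
This hunk restores the second half of the check/recheck idiom described in the fs/hugetlbfs/inode.c comment above: i_size is tested once after the page-cache lookup misses, and again under the page table lock before the new page becomes visible, because truncation must take that same lock to unmap. A compilable userspace analogue of the idiom (hypothetical names; a plain mutex stands in for the page table lock):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t file_size;	/* analogous to i_size; shrunk by "truncate" */

/*
 * Mirrors the restored flow: a cheap, racy early check, then an
 * authoritative recheck under the lock that a truncator must also
 * take before it can invalidate the entry.
 */
static bool install_entry(size_t idx)
{
	if (idx >= file_size)		/* early check: cheap, racy */
		return false;

	/* ... long-running work: allocate and fill the new entry ... */

	pthread_mutex_lock(&table_lock);
	if (idx >= file_size) {		/* recheck: file may have shrunk */
		pthread_mutex_unlock(&table_lock);
		return false;		/* back out, drop the entry */
	}
	/* ... publish the entry while still holding the lock ... */
	pthread_mutex_unlock(&table_lock);
	return true;
}

static void truncate_to(size_t new_size)
{
	pthread_mutex_lock(&table_lock);
	file_size = new_size;		/* no install can now slip past */
	/* ... remove entries at or beyond new_size ... */
	pthread_mutex_unlock(&table_lock);
}

The early check is purely an optimization; correctness comes from the recheck, since the lock orders every install against every truncate.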
@@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

 	/*
 	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep.  This serves two purposes:
-	 * 1) It prevents huge_pmd_unshare from being called elsewhere
-	 *    and making the ptep no longer valid.
-	 * 2) It synchronizes us with file truncation.
+	 * until finished with ptep.  This prevents huge_pmd_unshare from
+	 * being called elsewhere and making the ptep no longer valid.
 	 *
 	 * ptep could have already be assigned via huge_pte_offset.  That
 	 * is OK, as huge_pte_alloc will return the same value unless