
Commit e7c58097 authored by Mike Kravetz, committed by Linus Torvalds
hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"

This reverts commit c86aa7bb.

The reverted commit caused ABBA deadlocks when file migration raced with
file eviction for specific hugetlbfs files.  This was discovered with a
modified version of the LTP move_pages12 test.
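
An ABBA deadlock arises when two tasks acquire the same pair of locks in opposite order, each ending up blocked on the lock the other holds. A minimal userspace sketch of the pattern (illustrative only; the locks and code paths in the reported deadlock are hugetlbfs-internal, not these):

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Path 1 takes A then B. */
static void *take_ab(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);
	sleep(1);			/* widen the race window */
	pthread_mutex_lock(&lock_b);	/* blocks: take_ba holds B */
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

/* Path 2 takes B then A -- the opposite order. */
static void *take_ba(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_b);
	sleep(1);
	pthread_mutex_lock(&lock_a);	/* blocks: take_ab holds A */
	pthread_mutex_unlock(&lock_a);
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, take_ab, NULL);
	pthread_create(&t2, NULL, take_ba, NULL);
	pthread_join(t1, NULL);		/* never returns: ABBA deadlock */
	pthread_join(t2, NULL);
	return 0;
}

Run with both threads started, the program hangs: neither lock order can make progress, which is exactly the condition the revert eliminates by not imposing a second lock ordering on these paths.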

The purpose of the reverted patch was to close a long existing race
between hugetlbfs file truncation and page faults.  After more analysis
of the patch and impacted code, it was determined that i_mmap_rwsem
cannot be used for all required synchronization.  Therefore, revert this
patch while working on another approach to the underlying issue.
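
For context, the long-standing race in question, sketched as an interleaving (a simplified sketch based on the i_size checks visible in the mm/hugetlb.c hunks below, not a verbatim trace):

/*
 * faulting task                        truncating task
 * -------------                        ---------------
 * hugetlb_no_page()
 *   i_size check passes (idx < size)
 *                                      hugetlb_vmtruncate()
 *                                        shrinks i_size
 *                                        unmaps and frees pages in range
 *   allocates a new page and maps it
 *   beyond the new end of file
 *
 * The pre-patch (and post-revert) code narrows the window by rechecking
 * i_size under the page table lock, which truncation must also take
 * before it can unmap the page.
 */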

Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com


Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8ab88c71
fs/hugetlbfs/inode.c: +33 −28
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *	maps and global counts.
+ *	maps and global counts.  Page faults can not race with truncation
+ *	in this routine.  hugetlb_no_page() prevents page faults in the
+ *	truncated range.  It checks i_size before allocation, and again after
+ *	with the page table lock for the page held.  The same lock must be
+ *	acquired to unmap a page.
  * hole punch is indicated if end is not LLONG_MAX
  *	In the hole punch case we scan the range and release found pages.
  *	Only when releasing a page is the associated region/reserv map
  *	deleted.  The region/reserv map for ranges without associated
- *	pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ *	pages are not modified.  Page faults can race with hole punch.
+ *	This is indicated if we find a mapped page.
  * Note: If the passed end of range value is beyond the end of file, but
  * not LLONG_MAX this routine still performs a hole punch operation.
  */
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
+			u32 hash;
 
 			index = page->index;
+			hash = hugetlb_fault_mutex_hash(h, current->mm,
+							&pseudo_vma,
+							mapping, index, 0);
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			/*
-			 * A mapped page is impossible as callers should unmap
-			 * all references before calling.  And, i_mmap_rwsem
-			 * prevents the creation of additional mappings.
+			 * If page is mapped, it was faulted in after being
+			 * unmapped in caller.  Unmap (again) now after taking
+			 * the fault mutex.  The mutex will prevent faults
+			 * until we finish removing the page.
+			 *
+			 * This race can only happen in the hole punch case.
+			 * Getting here in a truncate operation is a bug.
 			 */
-			VM_BUG_ON(page_mapped(page));
+			if (unlikely(page_mapped(page))) {
+				BUG_ON(truncate_op);
+
+				i_mmap_lock_write(mapping);
+				hugetlb_vmdelete_list(&mapping->i_mmap,
+					index * pages_per_huge_page(h),
+					(index + 1) * pages_per_huge_page(h));
+				i_mmap_unlock_write(mapping);
+			}
 
 			lock_page(page);
 			/*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			}
 
 			unlock_page(page);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
 		huge_pagevec_release(&pvec);
 		cond_resched();
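
The hunks above restore per-page serialization through the shared fault-mutex table: removal and page faults on the same (mapping, index) pair hash to, and contend on, the same mutex. A compilable userspace analogue of that hashing/locking shape (hypothetical names and hash function; only the structure mirrors the kernel code above):

#include <pthread.h>
#include <stdint.h>

#define FAULT_MUTEX_TABLE_SIZE 64

static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_TABLE_SIZE] = {
	[0 ... FAULT_MUTEX_TABLE_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER,
};

/* Hash an (object, index) pair to one slot in the shared mutex table. */
static uint32_t fault_mutex_hash(const void *mapping, uint64_t index)
{
	uintptr_t key = (uintptr_t)mapping ^ (index * 0x9e3779b97f4a7c15ULL);

	return (uint32_t)(key % FAULT_MUTEX_TABLE_SIZE);
}

/*
 * Both the "fault" path and the "remove" path route through the same
 * slot's mutex, so removal of a given index cannot overlap a fault on
 * that index, without needing one lock per page.
 */
static void with_index_locked(const void *mapping, uint64_t index,
			      void (*fn)(uint64_t))
{
	uint32_t hash = fault_mutex_hash(mapping, index);

	pthread_mutex_lock(&fault_mutex_table[hash]);
	fn(index);
	pthread_mutex_unlock(&fault_mutex_table[hash]);
}

The design trade-off is that a fixed-size table bounds memory while still scattering contention: unrelated indexes only collide when their hashes do.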
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

 static void hugetlbfs_evict_inode(struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct resv_map *resv_map;
 
-	/*
-	 * The vfs layer guarantees that there are no other users of this
-	 * inode.  Therefore, it would be safe to call remove_inode_hugepages
-	 * without holding i_mmap_rwsem.  We acquire and hold here to be
-	 * consistent with other callers.  Since there will be no contention
-	 * on the semaphore, overhead is negligible.
-	 */
-	i_mmap_lock_write(mapping);
 	remove_inode_hugepages(inode, 0, LLONG_MAX);
-	i_mmap_unlock_write(mapping);
-
 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
 	/* root inode doesn't have the resv_map, so we should check it */
 	if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	i_mmap_lock_write(mapping);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	i_mmap_unlock_write(mapping);
+	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	return 0;
 }

@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			hugetlb_vmdelete_list(&mapping->i_mmap,
 						hole_start >> PAGE_SHIFT,
 						hole_end  >> PAGE_SHIFT);
-		remove_inode_hugepages(inode, hole_start, hole_end);
 		i_mmap_unlock_write(mapping);
+		remove_inode_hugepages(inode, hole_start, hole_end);
 		inode_unlock(inode);
 	}

@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		/* addr is the offset within the file (zero based) */
 		addr = index * hpage_size;
 
-		/*
-		 * fault mutex taken here, protects against fault path
-		 * and hole punch.  inode_lock previously taken protects
-		 * against truncation.
-		 */
+		/* mutex taken here, fault path and hole punch */
 		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
 						index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
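
Note the call-site reordering in hugetlb_vmtruncate() and hugetlbfs_punch_hole(): remove_inode_hugepages() now runs after i_mmap_unlock_write(). This appears necessary rather than cosmetic, because the restored removal loop may itself take i_mmap_rwsem when it finds a mapped page, and a kernel rwsem held for write is not recursive:

/*
 * Hypothetical mis-ordering, for illustration only -- this would
 * self-deadlock in the hole punch case after this revert:
 *
 *	i_mmap_lock_write(mapping);
 *	remove_inode_hugepages(inode, hole_start, hole_end);
 *		-> finds a mapped page
 *		-> i_mmap_lock_write(mapping);	// blocks on itself
 *	i_mmap_unlock_write(mapping);
 */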
mm/hugetlb.c: +11 −10
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * We can not race with truncation due to holding i_mmap_rwsem.
-	 * Check once here for faults beyond end of file.
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
 	 */
-	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
-		goto out;
-
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		if (idx >= size)
+			goto out;
+
 		/*
 		 * Check for page in userfault range
 		 */
@@ -3854,6 +3854,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
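
This hunk restores the second half of the check/recheck idiom described in the fs/hugetlbfs/inode.c comment above: i_size is tested once after the page-cache lookup misses, and again under the page table lock before the new page becomes visible, because truncation must take that same lock to unmap. A compilable userspace analogue of the idiom (hypothetical names; a plain mutex stands in for the page table lock):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t file_size;	/* analogous to i_size; shrunk by "truncate" */

/*
 * Mirrors the restored flow: a cheap, racy early check, then an
 * authoritative recheck under the lock that a truncator must also
 * take before it can invalidate the entry.
 */
static bool install_entry(size_t idx)
{
	if (idx >= file_size)		/* early check: cheap, racy */
		return false;

	/* ... long-running work: allocate and fill the new entry ... */

	pthread_mutex_lock(&table_lock);
	if (idx >= file_size) {		/* recheck: file may have shrunk */
		pthread_mutex_unlock(&table_lock);
		return false;		/* back out, drop the entry */
	}
	/* ... publish the entry while still holding the lock ... */
	pthread_mutex_unlock(&table_lock);
	return true;
}

static void truncate_to(size_t new_size)
{
	pthread_mutex_lock(&table_lock);
	file_size = new_size;		/* no install can now slip past */
	/* ... remove entries at or beyond new_size ... */
	pthread_mutex_unlock(&table_lock);
}

The early check is purely an optimization; correctness comes from the recheck, since the lock orders every install against every truncate.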
@@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

 	/*
 	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep.  This serves two purposes:
-	 * 1) It prevents huge_pmd_unshare from being called elsewhere
-	 *    and making the ptep no longer valid.
-	 * 2) It synchronizes us with file truncation.
+	 * until finished with ptep.  This prevents huge_pmd_unshare from
+	 * being called elsewhere and making the ptep no longer valid.
 	 *
 	 * ptep could have already be assigned via huge_pte_offset.  That
 	 * is OK, as huge_pte_alloc will return the same value unless