
Commit 9f141d6e authored by Jan Kara, committed by Dan Williams

dax: Call ->iomap_begin without entry lock during dax fault



Currently the ->iomap_begin() handler is called with the entry lock held. If the
filesystem were to hold any locks between ->iomap_begin() and ->iomap_end()
(as ext4 will want to, to keep a transaction open), this would cause a lock
inversion against iomap_apply() in the standard IO path, which first calls
->iomap_begin() and only then calls the ->actor() callback, which grabs entry
locks for DAX (if it faults while copying from/to user-provided buffers).

Fix the problem by nesting the grab of the entry lock inside the
->iomap_begin() / ->iomap_end() pair.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent f449b936
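
The lock inversion described in the commit message is the classic two-lock ordering problem: the old fault path took the DAX entry lock first and only then called ->iomap_begin(), under which the filesystem may take its own locks (e.g. hold a transaction open), while the regular IO path through iomap_apply() takes the filesystem side first and may then need the entry lock when ->actor() faults on a DAX-backed user buffer. The sketch below is not kernel code; it is a minimal user-space C analogy, with pthread mutexes standing in for the entry lock and for a hypothetical filesystem lock held across ->iomap_begin()/->iomap_end(), showing why both paths must agree on a single acquisition order, which is what this patch enforces by grabbing the entry lock inside the ->iomap_begin()/->iomap_end() pair.

/*
 * Minimal user-space analogy of the ordering problem (NOT kernel code):
 * entry_lock stands in for the DAX radix tree entry lock, fs_lock for a
 * filesystem lock held between ->iomap_begin() and ->iomap_end(), e.g.
 * an open transaction.  Names and structure are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;

/* Old fault path: entry lock first, then the fs lock taken inside
 * "->iomap_begin()" - the opposite order to io_path() below. */
void fault_path_old(void)
{
	pthread_mutex_lock(&entry_lock);	/* grab_mapping_entry() */
	pthread_mutex_lock(&fs_lock);		/* ->iomap_begin() */
	/* ... install the PTE ... */
	pthread_mutex_unlock(&fs_lock);		/* ->iomap_end() */
	pthread_mutex_unlock(&entry_lock);
}

/* Standard IO path (iomap_apply()): fs lock first, then the entry lock
 * when ->actor() faults on a DAX-backed user buffer. */
void io_path(void)
{
	pthread_mutex_lock(&fs_lock);		/* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);	/* fault inside ->actor() */
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_lock);		/* ->iomap_end() */
}

/* Fault path after this patch: same order as io_path(), so the two
 * paths can no longer each hold one lock while waiting for the other. */
void fault_path_new(void)
{
	pthread_mutex_lock(&fs_lock);		/* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);	/* grab_mapping_entry() */
	/* ... install the PTE ... */
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_lock);		/* ->iomap_end() */
}

int main(void)
{
	io_path();
	fault_path_new();
	puts("both paths take fs_lock before entry_lock");
	return 0;
}

With fault_path_old(), a concurrent io_path() could end up with each thread holding one mutex and blocking on the other; once both paths take fs_lock before entry_lock, that interleaving is impossible. That reordering is exactly what the diff below performs for both the PTE and the PMD fault handlers.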
+66 −55
@@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
 		goto finish_iomap;
 	}
 
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
+		goto finish_iomap;
+	}
+
@@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1169,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
-			goto finish_iomap;
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1183,31 +1195,26 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
-	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
 #ifdef CONFIG_FS_DAX_PMD
@@ -1330,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
 	/*
 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1364,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1373,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);