
Commit 4759d386 authored by Linus Torvalds
Pull DAX updates from Dan Williams:
 "The completion of Jan's DAX work for 4.10.

  As I mentioned in the libnvdimm-for-4.10 pull request, these are some
  final fixes for the DAX dirty-cacheline-tracking invalidation work
  that was merged through the -mm, ext4, and xfs trees in -rc1. These
  patches were prepared prior to the merge window, but we waited for
  4.10-rc1 to have a stable merge base after all the prerequisites were
  merged.

  Quoting Jan on the overall changes in these patches:

     "So I'd like all these 6 patches to go for rc2. The first three
      patches fix invalidation of exceptional DAX entries (a bug which
      is there for a long time) - without these patches data loss can
      occur on power failure even though user called fsync(2). The other
      three patches change locking of DAX faults so that ->iomap_begin()
      is called in a more relaxed locking context and we are safe to
      start a transaction there for ext4"

  These have received a build success notification from the kbuild
  robot, and pass the latest libnvdimm unit tests. There have not been
  any -next releases since -rc1, so they have not appeared there"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  ext4: Simplify DAX fault path
  dax: Call ->iomap_begin without entry lock during dax fault
  dax: Finish fault completely when loading holes
  dax: Avoid page invalidation races and unnecessary radix tree traversals
  mm: Invalidate DAX radix tree entries only if appropriate
  ext2: Return BH_New buffers for zeroed blocks
parents 238d1d0f 1db17542
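
For context, the data-loss case Jan describes is the basic DAX persistence pattern: an application mmap()s a file on a DAX filesystem, stores through the mapping, and relies on fsync()/msync() to flush the dirtied cachelines. The first three patches ensure the dirty radix tree entries backing that fsync() cannot be dropped by page cache invalidation. A minimal userspace sketch of the pattern follows (illustrative only, not part of this commit; the /mnt/pmem mount point and file name are assumptions):

/* Illustrative sketch: mmap + store + fsync on a DAX-mounted file.
 * The path below is an assumption, not taken from this commit. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmem/data", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (p == MAP_FAILED)
		return 1;

	memcpy(p, "persist me", 10);

	/* fsync() writes back the CPU caches for the dirtied DAX pages; the
	 * fixes in this series prevent page cache invalidation from dropping
	 * the dirty DAX radix tree entries that this fsync() relies on. */
	if (fsync(fd) < 0)
		perror("fsync");

	munmap(p, 4096);
	close(fd);
	return 0;
}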
fs/dax.c  +154 −89
@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+					  pgoff_t index, bool trunc)
+{
+	int ret = 0;
+	void *entry;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	if (!trunc &&
+	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	radix_tree_delete(page_tree, index);
+	mapping->nrexceptional--;
+	ret = 1;
+out:
+	put_unlocked_mapping_entry(mapping, index, entry);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *entry;
+	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 	 * caller has seen exceptional entry for this index, we better find it
 	 * at that index as well...
 	 */
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
-	radix_tree_delete(&mapping->page_tree, index);
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = 0;
+	void *entry, **slot;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry) ||
+	    slot_locked(mapping, slot))
+		goto out;
+	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto out;
+	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
+	ret = 1;
+out:
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	if (ret)
+		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	return ret;
+}
 
-	return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
 			 struct vm_fault *vmf)
 {
 	struct page *page;
+	int ret;
 
 	/* Hole page already exists? Return it...  */
-	if (!radix_tree_exceptional_entry(entry)) {
-		vmf->page = entry;
-		return VM_FAULT_LOCKED;
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto out;
 	}
 
 	/* This will replace locked radix tree entry with a hole page */
@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 				   vmf->gfp_mask | __GFP_ZERO);
 	if (!page)
 		return VM_FAULT_OOM;
+ out:
 	vmf->page = page;
-	return VM_FAULT_LOCKED;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+	return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
 		return -EIO;
 
+	/*
+	 * Write can allocate block for an area which has a hole page mapped
+	 * into page tables. We have to tear down these mappings so that data
+	 * written by write(2) is visible in mmap.
+	 */
+	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pos >> PAGE_SHIFT,
+					      (end - 1) >> PAGE_SHIFT);
+	}
+
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE)
 		flags |= IOMAP_WRITE;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them:  A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole.  It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. We'll eventually need to shift this down even further so that
-	 * we can check if we allocated blocks over a hole first.
-	 */
-	if (mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(mapping,
-				pos >> PAGE_SHIFT,
-				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
 				iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
-			break;
+			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1128,32 +1195,26 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
- unlock_entry:
-	if (vmf_ret != VM_FAULT_LOCKED || error)
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
-	}
-	return VM_FAULT_NOPAGE | major;
-}
+	return vmf_ret;
+}
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
 #ifdef CONFIG_FS_DAX_PMD
@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
 	/*
 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
 				&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);
fs/ext2/inode.c  +1 −2
@@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else {
-		*new = true;
 	}
+	*new = true;
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
fs/ext4/file.c  +10 −38
@@ -258,7 +258,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -266,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-						EXT4_DATA_TRANS_BLOCKS(sb));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else
-		result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
@@ -292,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 						pmd_t *pmd, unsigned int flags)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = flags & FAULT_FLAG_WRITE;
@@ -300,27 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-				ext4_chunk_trans_blocks(inode,
-							PMD_SIZE / PAGE_SIZE));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else {
-		result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-					     &ext4_iomap_ops);
 	}
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
+				     &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
include/linux/dax.h  +3 −0
@@ -41,6 +41,9 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
 
mm/truncate.c  +61 −14
@@ -24,20 +24,12 @@
 #include <linux/rmap.h>
 #include "internal.h"
 
-static void clear_exceptional_entry(struct address_space *mapping,
-				    pgoff_t index, void *entry)
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+			       void *entry)
 {
 	struct radix_tree_node *node;
 	void **slot;
 
-	/* Handled by shmem itself */
-	if (shmem_mapping(mapping))
-		return;
-
-	if (dax_mapping(mapping)) {
-		dax_delete_mapping_entry(mapping, index);
-		return;
-	}
 	spin_lock_irq(&mapping->tree_lock);
 	/*
 	 * Regular page slots are stabilized by the page lock even
@@ -55,6 +47,56 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
+/*
+ * Unconditionally remove exceptional entry. Usually called from truncate path.
+ */
+static void truncate_exceptional_entry(struct address_space *mapping,
+				       pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return;
+
+	if (dax_mapping(mapping)) {
+		dax_delete_mapping_entry(mapping, index);
+		return;
+	}
+	clear_shadow_entry(mapping, index, entry);
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
+ * clean entries.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+					pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return 1;
+	if (dax_mapping(mapping))
+		return dax_invalidate_mapping_entry(mapping, index);
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+					 pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return 1;
+	if (dax_mapping(mapping))
+		return dax_invalidate_mapping_entry_sync(mapping, index);
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
 /**
  * do_invalidatepage - invalidate part or all of a page
  * @page: the page which is affected
@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				truncate_exceptional_entry(mapping, index,
+							   page);
 				continue;
 			}
 
@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			}
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				truncate_exceptional_entry(mapping, index,
+							   page);
 				continue;
 			}
 
@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				invalidate_exceptional_entry(mapping, index,
+							     page);
 				continue;
 			}
 
@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				if (!invalidate_exceptional_entry2(mapping,
+								   index, page))
+					ret = -EBUSY;
 				continue;
 			}
 
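
The invalidation added to dax_iomap_actor() above addresses write(2) vs. mmap coherence: a write can allocate a block over a range that was previously a hole, and any zeroed hole page still mapped into page tables has to be torn down so the newly written data is visible through the mapping. A userspace sketch of that case follows (illustrative only, not part of this commit; the file path is an assumption):

/* Illustrative sketch of the write(2) vs. mmap coherence case handled in
 * dax_iomap_actor(). The path below is an assumption. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmem/data", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Read fault over the hole: a zeroed hole page now backs p[0]. */
	char before = p[0];

	/* write(2) allocates a real block for the same range... */
	if (pwrite(fd, "x", 1, 0) != 1)
		return 1;

	/* ...so the stale hole mapping must be invalidated for this read
	 * through the mapping to observe 'x' instead of the old zero page. */
	printf("before=%d after=%c\n", before, p[0]);

	munmap(p, 4096);
	close(fd);
	return 0;
}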