Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5e25c269 authored by Eryu Guan's avatar Eryu Guan Committed by Darrick J. Wong
Browse files

fs: invalidate page cache after end_io() in dio completion



Commit 332391a9 ("fs: Fix page cache inconsistency when mixing
buffered and AIO DIO") moved page cache invalidation from
iomap_dio_rw() to iomap_dio_complete() for iomap based direct write
path, but before the dio->end_io() call, and it re-introdued the bug
fixed by commit c771c14b ("iomap: invalidate page caches should
be after iomap_dio_complete() in direct write").

I found this because fstests generic/418 started failing on XFS with
v4.14-rc3 kernel, which is the regression test for this specific
bug.

So similarly, fix it by moving dio->end_io() (which does the
unwritten extent conversion) before page cache invalidation, to make
sure next buffer read reads the final real allocations not unwritten
extents. I also add some comments about why should end_io() go first
in case we get it wrong again in the future.

Note that, there's no such problem in the non-iomap based direct
write path, because we didn't remove the page cache invalidation
after the ->direct_IO() in generic_file_direct_write() call, but I
decided to fix dio_complete() too so we don't leave a landmine
there, also be consistent with iomap_dio_complete().

Fixes: 332391a9 ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
Signed-off-by: default avatarEryu Guan <eguan@redhat.com>
Reviewed-by: default avatarDarrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: default avatarDarrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: default avatarJan Kara <jack@suse.cz>
Reviewed-by: default avatarLukas Czerner <lczerner@redhat.com>
parent 793d7dbe
Loading
Loading
Loading
Loading
+12 −8
Original line number Original line Diff line number Diff line
@@ -259,12 +259,24 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
	if (ret == 0)
	if (ret == 0)
		ret = transferred;
		ret = transferred;


	if (dio->end_io) {
		// XXX: ki_pos??
		err = dio->end_io(dio->iocb, offset, ret, dio->private);
		if (err)
			ret = err;
	}

	/*
	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after dio->end_io(), as
	 * some filesystems convert unwritten extents to real allocations in
	 * end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	 */
	if (ret > 0 && dio->op == REQ_OP_WRITE &&
	if (ret > 0 && dio->op == REQ_OP_WRITE &&
	    dio->inode->i_mapping->nrpages) {
	    dio->inode->i_mapping->nrpages) {
@@ -274,14 +286,6 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
		WARN_ON_ONCE(err);
		WARN_ON_ONCE(err);
	}
	}


	if (dio->end_io) {

		// XXX: ki_pos??
		err = dio->end_io(dio->iocb, offset, ret, dio->private);
		if (err)
			ret = err;
	}

	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(dio->inode);
		inode_dio_end(dio->inode);


+24 −17
Original line number Original line Diff line number Diff line
@@ -714,23 +714,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
{
	struct kiocb *iocb = dio->iocb;
	struct kiocb *iocb = dio->iocb;
	struct inode *inode = file_inode(iocb->ki_filp);
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	ssize_t ret;
	ssize_t ret;


	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 */
	if (!dio->error &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		ret = invalidate_inode_pages2_range(inode->i_mapping,
				iocb->ki_pos >> PAGE_SHIFT,
				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	if (dio->end_io) {
	if (dio->end_io) {
		ret = dio->end_io(iocb,
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->error ? dio->error : dio->size,
@@ -742,12 +728,33 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
	if (likely(!ret)) {
	if (likely(!ret)) {
		ret = dio->size;
		ret = dio->size;
		/* check for short read */
		/* check for short read */
		if (iocb->ki_pos + ret > dio->i_size &&
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - iocb->ki_pos;
			ret = dio->i_size - offset;
		iocb->ki_pos += ret;
		iocb->ki_pos += ret;
	}
	}


	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after dio->end_io(), as
	 * some filesystems convert unwritten extents to real allocations in
	 * end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		int err;
		err = invalidate_inode_pages2_range(inode->i_mapping,
				offset >> PAGE_SHIFT,
				(offset + dio->size - 1) >> PAGE_SHIFT);
		WARN_ON_ONCE(err);
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);
	kfree(dio);