
Commit 16d4d435 authored by Christoph Hellwig, committed by Dave Chinner

xfs: split direct I/O and DAX path



So far the DAX code overloaded the direct I/O code path.  There is very little
in common between the two, and untangling them allows us to clean up both variants.

As a side effect we also get separate trace points for both I/O types.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
parent fa8d972d
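
Before the diff itself, a minimal standalone sketch of the dispatch this commit introduces in xfs_file_read_iter() (the write side in xfs_file_write_iter() is symmetric). This is a model, not kernel code: pick_read_path() and its boolean parameters are hypothetical stand-ins for IS_DAX(inode) and iocb->ki_flags & IOCB_DIRECT.

/* Standalone model of the read dispatch after this commit: IS_DAX() is
 * checked before IOCB_DIRECT, so a DAX inode takes the DAX path even when
 * the file was opened with O_DIRECT. Compiles as plain C. */
#include <stdbool.h>
#include <stdio.h>

enum io_path { IO_DAX, IO_DIRECT, IO_BUFFERED };

static enum io_path pick_read_path(bool is_dax, bool iocb_direct)
{
	if (is_dax)
		return IO_DAX;		/* xfs_file_dax_read() */
	else if (iocb_direct)
		return IO_DIRECT;	/* xfs_file_dio_aio_read() */
	else
		return IO_BUFFERED;	/* xfs_file_buffered_aio_read() */
}

int main(void)
{
	/* DAX wins over O_DIRECT: prints 0 (IO_DAX), then 1 (IO_DIRECT) */
	printf("%d\n", pick_read_path(true, true));
	printf("%d\n", pick_read_path(false, true));
	return 0;
}

The point of the ordering is that a DAX inode never reaches the direct I/O path, which is what lets the alignment checks and dax_do_io() calls be removed from the DIO functions below.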
fs/xfs/xfs_file.c  +110 −29
@@ -305,14 +305,12 @@ xfs_file_dio_aio_read(
 	else
 		target = ip->i_mount->m_ddev_targp;
 
-	if (!IS_DAX(inode)) {
-		/* DIO must be aligned to device logical sector size */
-		if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
-			if (iocb->ki_pos == isize)
-				return 0;
-			return -EINVAL;
-		}
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+		if (iocb->ki_pos == isize)
+			return 0;
+		return -EINVAL;
 	}
 
 	/*
 	 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -360,13 +358,37 @@ xfs_file_dio_aio_read(
 	}
 
 	data = *to;
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-				NULL, 0);
-	} else {
-		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-				xfs_get_blocks_direct, NULL, NULL, 0);
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, NULL, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
 	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct iov_iter		data = *to;
+	size_t			count = iov_iter_count(to);
+	ssize_t			ret = 0;
+
+	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+	if (!count)
+		return 0; /* skip atime */
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
 	if (ret > 0) {
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
@@ -399,7 +421,8 @@ xfs_file_read_iter(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct xfs_mount	*mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 	ssize_t			ret = 0;
 
 	XFS_STATS_INC(mp, xs_read_calls);
@@ -407,7 +430,9 @@ xfs_file_read_iter(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (iocb->ki_flags & IOCB_DIRECT)
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_read(iocb, to);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_read(iocb, to);
 	else
 		ret = xfs_file_buffered_aio_read(iocb, to);
@@ -755,8 +780,7 @@ xfs_file_dio_aio_write(
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if (!IS_DAX(inode) &&
-	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -825,14 +849,9 @@ xfs_file_dio_aio_write(
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
 	data = *from;
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-				xfs_end_io_direct_write, 0);
-	} else {
-		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-				xfs_get_blocks_direct, xfs_end_io_direct_write,
-				NULL, DIO_ASYNC_EXTEND);
-	}
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, xfs_end_io_direct_write,
+			NULL, DIO_ASYNC_EXTEND);
 
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
@@ -849,10 +868,70 @@ xfs_file_dio_aio_write(
 	xfs_rw_iunlock(ip, iolock);
 
 	/*
-	 * No fallback to buffered IO on errors for XFS. DAX can result in
-	 * partial writes, but direct IO will either complete fully or fail.
+	 * No fallback to buffered IO on errors for XFS, direct IO will either
+	 * complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	struct iov_iter		data;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+		unaligned_io = 1;
+		iolock = XFS_IOLOCK_EXCL;
+	} else if (mapping->nrpages) {
+		iolock = XFS_IOLOCK_EXCL;
+	} else {
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
 	 */
-	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2(mapping);
+		WARN_ON_ONCE(ret);
+	}
+
+	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+	data = *from;
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+			xfs_end_io_direct_write, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(from, ret);
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -934,7 +1013,9 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_write(iocb, from);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
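
The subtlest part of the new xfs_file_dax_write() above is the IOLOCK mode selection: sub-block ("unaligned") writes, and writes against a mapping that still has cached pages, start out exclusive; everything else runs shared, and a lock taken exclusive only because of cached pages is demoted back to shared once the page cache has been invalidated. A minimal standalone model of that initial decision follows; it is not kernel code, the names are hypothetical, and the example assumes 4k filesystem blocks.

/* Standalone model of the initial lock choice in xfs_file_dax_write(),
 * mirroring the (pos | pos + count) & mp->m_blockmask checks. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum iolock { IOLOCK_SHARED, IOLOCK_EXCL };

static enum iolock dax_write_iolock(uint64_t pos, uint64_t count,
				    uint64_t blockmask, bool has_pagecache)
{
	/* "unaligned" here means not aligned to a filesystem block */
	bool unaligned = (pos & blockmask) || ((pos + count) & blockmask);

	if (unaligned || has_pagecache)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}

int main(void)
{
	uint64_t mask = 4096 - 1;	/* 4k filesystem blocks */

	/* block-aligned write, no cached pages: shared lock suffices (0) */
	printf("%d\n", dax_write_iolock(8192, 4096, mask, false));
	/* sub-block write: exclusive lock (1) */
	printf("%d\n", dax_write_iolock(8192, 100, mask, false));
	return 0;
}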
fs/xfs/xfs_trace.h  +2 −0
@@ -1164,8 +1164,10 @@ DEFINE_EVENT(xfs_file_class, name, \
 	TP_ARGS(ip, count, offset))
 DEFINE_RW_EVENT(xfs_file_buffered_read);
 DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 
 DECLARE_EVENT_CLASS(xfs_page_class,