
Commit 542c3118 authored by Dave Chinner

Merge branch 'xfs-dio-extend-fix' into for-next

Conflicts:
	fs/xfs/xfs_file.c
parents 6a63ef06 0cefb29e
fs/xfs/xfs_aops.c +192 −78
@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to attach an ioend that describes the
 * type of write IO we are doing. This passes to the completion function the
 * operations it needs to perform. If the mapping is for an overwrite wholly
 * within the EOF then we don't need an ioend and so we don't allocate one.
 * This avoids the unnecessary overhead of allocating and freeing ioends for
 * workloads that don't require transactions on IO completion.
 *
 * If we get multiple mappings in a single IO, we might be mapping different
 * types. But because the direct IO can only have a single private pointer, we
 * need to ensure that:
 *
 * a) i) the ioend spans the entire region of unwritten mappings; or
 *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
 * b) if it contains unwritten extents, it is *permanently* marked as such
 *
 * We could do this by chaining ioends like buffered IO does, but we only
 * actually get one IO completion callback from the direct IO, and that spans
 * the entire IO regardless of how many mappings and IOs are needed to complete
 * the DIO. There is only going to be one reference to the ioend and its life
 * cycle is constrained by the DIO completion code. Hence we don't need
 * reference counting here.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	struct xfs_ioend	*ioend;
	xfs_off_t		size = bh_result->b_size;
	int			type;

	if (ISUNWRITTEN(imap))
		type = XFS_IO_UNWRITTEN;
	else
		type = XFS_IO_OVERWRITE;

	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);

	if (bh_result->b_private) {
		ioend = bh_result->b_private;
		ASSERT(ioend->io_size > 0);
		ASSERT(offset >= ioend->io_offset);
		if (offset + size > ioend->io_offset + ioend->io_size)
			ioend->io_size = offset - ioend->io_offset + size;

		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
			ioend->io_type = XFS_IO_UNWRITTEN;

		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
					      ioend->io_size, ioend->io_type,
					      imap);
	} else if (type == XFS_IO_UNWRITTEN ||
		   offset + size > i_size_read(inode)) {
		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_size = size;

		bh_result->b_private = ioend;
		set_buffer_defer_completion(bh_result);

		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
					   imap);
	} else {
		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
					    imap);
	}
}
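
The rules in the comment above condense into a small accumulator. Below is a hedged userspace model, not code from the commit: toy_ioend, toy_map_direct() and the eof parameter are hypothetical stand-ins for struct xfs_ioend, xfs_map_direct() and i_size_read().

#include <stdlib.h>

struct toy_ioend {
	long long	io_offset;
	long long	io_size;
	int		unwritten;	/* sticky, per rule b) above */
};

/* Model one get_blocks callback: grow the single ioend to cover this
 * mapping, or allocate it on the first mapping that needs one. */
static void
toy_map_direct(struct toy_ioend **iop, long long offset, long long size,
	       int unwritten, long long eof)
{
	struct toy_ioend	*io = *iop;

	if (io) {
		/* extend the span to the end of this mapping */
		if (offset + size > io->io_offset + io->io_size)
			io->io_size = offset - io->io_offset + size;
		/* once unwritten, permanently unwritten */
		io->unwritten |= unwritten;
	} else if (unwritten || offset + size > eof) {
		io = calloc(1, sizeof(*io));
		io->io_offset = offset;
		io->io_size = size;
		io->unwritten = unwritten;
		*iop = io;
	}
	/* else: overwrite wholly within EOF, no ioend needed */
}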

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the mapping
 * is, so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the mapping
 * for blocks beyond EOF must be marked new so that sub block regions can be
 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
 * was just allocated or is unwritten, otherwise the callers would overwrite
 * existing data with zeros. Hence we have to split the mapping into a range up
 * to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  1 << inode->i_blkbits);
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}
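
A worked example of the arithmetic above, as a standalone program. None of it is from the commit; all values are hypothetical: 4096-byte blocks (i_blkbits == 12), an extent covering blocks [100, 108), a request for 32768 bytes starting at block 102, and in-core EOF at byte 425000.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int	i_blkbits = 12;		/* 4096 byte blocks */
	int64_t		br_startoff = 100;	/* extent start, in blocks */
	int64_t		br_blockcount = 8;	/* extent length, in blocks */
	int64_t		iblock = 102;		/* first block requested */
	int64_t		size = 32768;		/* bytes requested */
	int64_t		isize = 425000;		/* in-core EOF */
	int64_t		offset = iblock << i_blkbits;	/* 417792 */
	int64_t		mapping_size;

	/* bytes left in the extent from the requested block onwards */
	mapping_size = br_startoff + br_blockcount - iblock;
	mapping_size <<= i_blkbits;		/* 6 blocks = 24576 bytes */

	if (mapping_size > size)
		mapping_size = size;		/* not taken: 24576 < 32768 */

	/*
	 * The mapping spans EOF (417792 + 24576 >= 425000), so it is
	 * trimmed to the block that contains EOF: roundup(425000 - 417792,
	 * 4096) = roundup(7208, 4096) = 8192.
	 */
	if (offset < isize && offset + mapping_size >= isize)
		mapping_size = (isize - offset + 4095) & ~4095LL;

	printf("b_size = %lld\n", (long long)mapping_size);	/* prints 8192 */
	return 0;
}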

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
@@ -1321,31 +1432,37 @@ __xfs_get_blocks(

			xfs_iunlock(ip, lockmode);
		}

-		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
-		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

-	if (imap.br_startblock != HOLESTARTBLOCK &&
-	    imap.br_startblock != DELAYSTARTBLOCK) {
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
+	/*
-		 * For unwritten extents do not report a disk address on
-		 * the read case (treat as if we're reading into a hole).
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
-		if (create || !ISUNWRITTEN(&imap))
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
-		if (create && ISUNWRITTEN(&imap)) {
-			if (direct) {
-				bh_result->b_private = inode;
-				set_buffer_defer_completion(bh_result);
-			}
+		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
-		}
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
	}

	/*
@@ -1378,39 +1495,6 @@ __xfs_get_blocks(
		}
	}

-	/*
-	 * If this is O_DIRECT or the mpage code calling tell them how large
-	 * the mapping is, so that we can avoid repeated get_blocks calls.
-	 *
-	 * If the mapping spans EOF, then we have to break the mapping up as the
-	 * mapping for blocks beyond EOF must be marked new so that sub block
-	 * regions can be correctly zeroed. We can't do this for mappings within
-	 * EOF unless the mapping was just allocated or is unwritten, otherwise
-	 * the callers would overwrite existing data with zeros. Hence we have
-	 * to split the mapping into a range up to and including EOF, and a
-	 * second mapping for beyond EOF.
-	 */
-	if (direct || size > (1 << inode->i_blkbits)) {
-		xfs_off_t		mapping_size;
-
-		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-		mapping_size <<= inode->i_blkbits;
-
-		ASSERT(mapping_size > 0);
-		if (mapping_size > size)
-			mapping_size = size;
-		if (offset < i_size_read(inode) &&
-		    offset + mapping_size >= i_size_read(inode)) {
-			/* limit mapping to block that spans EOF */
-			mapping_size = roundup_64(i_size_read(inode) - offset,
-						  1 << inode->i_blkbits);
-		}
-		if (mapping_size > LONG_MAX)
-			mapping_size = LONG_MAX;
-
-		bh_result->b_size = mapping_size;
-	}

	return 0;

out_unlock:
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
/*
 * Complete a direct I/O write request.
 *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
 */
STATIC void
xfs_end_io_direct_write(
@@ -1455,45 +1541,73 @@ xfs_end_io_direct_write(
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;

-	if (XFS_FORCED_SHUTDOWN(mp))
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);

+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
		return;
+	}

+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_end_io;

	/*
	 * dio completion end_io functions are only called on writes if more
	 * than 0 bytes was written.
	 */
	ASSERT(size > 0);

	/*
	 * The ioend only maps whole blocks, while the IO may be sector aligned.
	 * Hence the ioend offset/size may not match the IO offset/size exactly.
	 * Because we don't map overwrites within EOF into the ioend, the offset
	 * may not match, but only if the endio spans EOF.  Either way, write
	 * the IO sizes into the ioend so that completion processing does the
	 * right thing.
	 */
	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
	ioend->io_size = size;
	ioend->io_offset = offset;

	/*
-	 * While the generic direct I/O code updates the inode size, it does
-	 * so only after the end_io handler is called, which means our
-	 * end_io handler thinks the on-disk size is outside the in-core
-	 * size.  To prevent this just update it a little bit earlier here.
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
	 */
+	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);

	/*
-	 * For direct I/O we do not know if we need to allocate blocks or not,
-	 * so we can't preallocate an append transaction, as that results in
-	 * nested reservations and log space deadlocks. Hence allocate the
-	 * transaction here. While this is sub-optimal and can block IO
-	 * completion for some time, we're stuck with doing it this way until
-	 * we can pass the ioend to the direct IO allocation callbacks and
-	 * avoid nesting that way.
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing on the error if it is
+	 * possible as we can't return it from here.
	 */
-	if (private && size > 0) {
-		xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (offset + size > ip->i_d.di_size) {
-		struct xfs_trans	*tp;
-		int			error;
+	if (ioend->io_type == XFS_IO_OVERWRITE)
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);

-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
}

-		xfs_setfilesize(ip, tp, offset, size);
-	}
-}
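
The "EOF moving backwards" hazard named in the comment above is easy to model in userspace. A hedged sketch, not from the commit, with pthread_mutex_t standing in for ip->i_flags_lock: if two AIO completions race through the unlocked variant, the completion with the smaller end offset can store last and shrink the visible file size; holding one lock across the compare and the store makes the update monotonic.

#include <pthread.h>

struct inode_model {
	pthread_mutex_t	i_flags_lock;	/* stand-in for ip->i_flags_lock */
	long long	i_size;		/* in-core EOF */
};

/* Racy: the compare and the store are separate, so a completion that
 * read a stale i_size can overwrite a larger value written meanwhile. */
static void update_eof_racy(struct inode_model *ip, long long end)
{
	if (end > ip->i_size)
		ip->i_size = end;
}

/* Serialised, as the patch does it: EOF only ever moves forwards. */
static void update_eof(struct inode_model *ip, long long end)
{
	pthread_mutex_lock(&ip->i_flags_lock);
	if (end > ip->i_size)
		ip->i_size = end;
	pthread_mutex_unlock(&ip->i_flags_lock);
}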

STATIC ssize_t
xfs_vm_direct_IO(
	int			rw,
fs/xfs/xfs_file.c +42 −4
@@ -569,20 +569,41 @@ xfs_file_aio_write_checks(
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
	 */
+	spin_lock(&ip->i_flags_lock);
	if (*pos > i_size_read(inode)) {
		bool	zero = false;

+		spin_unlock(&ip->i_flags_lock);
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_rw_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_rw_ilock(ip, *iolock);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
			goto restart;
		}
		error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
		if (error)
			return error;
-	}
+	} else
+		spin_unlock(&ip->i_flags_lock);
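
The control flow above (sample EOF under the spinlock, upgrade the IO lock if zeroing is needed, drain AIO, redo all checks) condenses into a short sketch. This is a hedged userspace model, not the patch: write_ctx, write_checks() and the pthread locks are hypothetical stand-ins for the XFS inode, xfs_file_aio_write_checks(), XFS_IOLOCK and ip->i_flags_lock.

#include <pthread.h>

struct write_ctx {
	pthread_rwlock_t iolock;	/* stand-in for XFS_IOLOCK */
	pthread_mutex_t	 flags_lock;	/* stand-in for ip->i_flags_lock */
	long long	 i_size;	/* in-core EOF */
};

/* Called with iolock held; *excl says whether it is held exclusive. */
static int
write_checks(struct write_ctx *c, long long pos, int *excl)
{
restart:
	/* ... other write checks happen here on each pass ... */
	pthread_mutex_lock(&c->flags_lock);
	if (pos > c->i_size) {
		pthread_mutex_unlock(&c->flags_lock);
		if (!*excl) {
			/* upgrade the IO lock, wait for in-flight AIO to
			 * drain (inode_dio_wait() in the patch), then redo
			 * every check from the top */
			pthread_rwlock_unlock(&c->iolock);
			pthread_rwlock_wrlock(&c->iolock);
			*excl = 1;
			goto restart;
		}
		/* zero from the old EOF up to pos here */
		return 0;
	}
	pthread_mutex_unlock(&c->flags_lock);
	return 0;
}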

	/*
	 * Updating the timestamps will grab the ilock again from
@@ -644,6 +665,8 @@ xfs_file_dio_aio_write(
	int			iolock;
	size_t			count = iov_iter_count(from);
	loff_t			pos = iocb->ki_pos;
	loff_t			end;
	struct iov_iter		data;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

@@ -683,10 +706,11 @@ xfs_file_dio_aio_write(
	if (ret)
		goto out;
	iov_iter_truncate(from, count);
	end = pos + count - 1;

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, pos + count - 1);
+						   pos, end);
		if (ret)
			goto out;
		/*
@@ -696,7 +720,7 @@ xfs_file_dio_aio_write(
		 */
		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
					pos >> PAGE_CACHE_SHIFT,
-					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+					end >> PAGE_CACHE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}
@@ -713,8 +737,22 @@ xfs_file_dio_aio_write(
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-	ret = generic_file_direct_write(iocb, from, pos);

+	data = *from;
+	ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);

	/* see generic_file_direct_write() for why this is necessary */
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      pos >> PAGE_CACHE_SHIFT,
					      end >> PAGE_CACHE_SHIFT);
	}

	if (ret > 0) {
		pos += ret;
		iov_iter_advance(from, ret);
		iocb->ki_pos = pos;
	}
out:
	xfs_rw_iunlock(ip, iolock);
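
One detail worth calling out in the open-coded path above: the direct_IO call is handed a copy of the iterator (data = *from) and the caller's iterator is only advanced by the bytes the IO actually completed. A hedged userspace analogue, not from the commit; struct iter, do_direct_io() and dio_write() are hypothetical:

#include <stddef.h>
#include <sys/types.h>

struct iter {
	const char	*p;
	size_t		len;
};

/* Hypothetical stand-in for a_ops->direct_IO(); here it pretends the
 * whole buffer was written. */
static ssize_t do_direct_io(struct iter *data, long long pos)
{
	(void)pos;
	return (ssize_t)data->len;
}

static ssize_t dio_write(struct iter *from, long long pos)
{
	struct iter	data = *from;	/* mirrors "data = *from" above */
	ssize_t		ret = do_direct_io(&data, pos);

	/* only real progress is reflected back to the caller */
	if (ret > 0) {
		from->p += ret;		/* mirrors iov_iter_advance() */
		from->len -= ret;
	}
	return ret;
}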

fs/xfs/xfs_trace.h +5 −0
@@ -1221,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);

DECLARE_EVENT_CLASS(xfs_simple_io_class,
	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),